AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43 cl::init(15), cl::Hidden);
44
45static cl::opt<unsigned>
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
47 cl::Hidden);
48
50 "call-penalty-sm-change", cl::init(5), cl::Hidden,
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
53
55 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
56 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
57
58static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
59 cl::init(true), cl::Hidden);
60
61static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
62 cl::init(true), cl::Hidden);
63
64// A complete guess as to a reasonable cost.
65static cl::opt<unsigned>
66 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
67 cl::desc("The cost of a histcnt instruction"));
68
69namespace {
70class TailFoldingOption {
71 // These bitfields will only ever be set to something non-zero in operator=,
72 // when setting the -sve-tail-folding option. This option should always be of
73 // the form (default|simple|all|disable)[+(Flag1|Flag2|etc)], where here
74 // InitialBits is one of (disabled|all|simple). EnableBits represents
75 // additional flags we're enabling, and DisableBits for those flags we're
76 // disabling. The default flag is tracked in the variable NeedsDefault, since
77 // at the time of setting the option we may not know what the default value
78 // for the CPU is.
79 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
80 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
81 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
82
83 // This value needs to be initialised to true in case the user does not
84 // explicitly set the -sve-tail-folding option.
85 bool NeedsDefault = true;
86
87 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
88
89 void setNeedsDefault(bool V) { NeedsDefault = V; }
90
91 void setEnableBit(TailFoldingOpts Bit) {
92 EnableBits |= Bit;
93 DisableBits &= ~Bit;
94 }
95
96 void setDisableBit(TailFoldingOpts Bit) {
97 EnableBits &= ~Bit;
98 DisableBits |= Bit;
99 }
100
101 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
102 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
103
104 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
105 "Initial bits should only include one of "
106 "(disabled|all|simple|default)");
107 Bits = NeedsDefault ? DefaultBits : InitialBits;
108 Bits |= EnableBits;
109 Bits &= ~DisableBits;
110
111 return Bits;
112 }
113
114 void reportError(std::string Opt) {
115 errs() << "invalid argument '" << Opt
116 << "' to -sve-tail-folding=; the option should be of the form\n"
117 " (disabled|all|default|simple)[+(reductions|recurrences"
118 "|reverse|noreductions|norecurrences|noreverse)]\n";
119 report_fatal_error("Unrecognised tail-folding option");
120 }
121
122public:
123
124 void operator=(const std::string &Val) {
125 // If the user explicitly sets -sve-tail-folding= then treat as an error.
126 if (Val.empty()) {
127 reportError("");
128 return;
129 }
130
131 // Since the user is explicitly setting the option we don't automatically
132 // need the default unless they require it.
133 setNeedsDefault(false);
134
135 SmallVector<StringRef, 4> TailFoldTypes;
136 StringRef(Val).split(TailFoldTypes, '+', -1, false);
137
138 unsigned StartIdx = 1;
139 if (TailFoldTypes[0] == "disabled")
140 setInitialBits(TailFoldingOpts::Disabled);
141 else if (TailFoldTypes[0] == "all")
142 setInitialBits(TailFoldingOpts::All);
143 else if (TailFoldTypes[0] == "default")
144 setNeedsDefault(true);
145 else if (TailFoldTypes[0] == "simple")
146 setInitialBits(TailFoldingOpts::Simple);
147 else {
148 StartIdx = 0;
149 setInitialBits(TailFoldingOpts::Disabled);
150 }
151
152 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
153 if (TailFoldTypes[I] == "reductions")
154 setEnableBit(TailFoldingOpts::Reductions);
155 else if (TailFoldTypes[I] == "recurrences")
156 setEnableBit(TailFoldingOpts::Recurrences);
157 else if (TailFoldTypes[I] == "reverse")
158 setEnableBit(TailFoldingOpts::Reverse);
159 else if (TailFoldTypes[I] == "noreductions")
160 setDisableBit(TailFoldingOpts::Reductions);
161 else if (TailFoldTypes[I] == "norecurrences")
162 setDisableBit(TailFoldingOpts::Recurrences);
163 else if (TailFoldTypes[I] == "noreverse")
164 setDisableBit(TailFoldingOpts::Reverse);
165 else
166 reportError(Val);
167 }
168 }
169
170 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
171 return (getBits(DefaultBits) & Required) == Required;
172 }
173};
174} // namespace
175
176TailFoldingOption TailFoldingOptionLoc;
177
179 "sve-tail-folding",
180 cl::desc(
181 "Control the use of vectorisation using tail-folding for SVE where the"
182 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
183 "\ndisabled (Initial) No loop types will vectorize using "
184 "tail-folding"
185 "\ndefault (Initial) Uses the default tail-folding settings for "
186 "the target CPU"
187 "\nall (Initial) All legal loop types will vectorize using "
188 "tail-folding"
189 "\nsimple (Initial) Use tail-folding for simple loops (not "
190 "reductions or recurrences)"
191 "\nreductions Use tail-folding for loops containing reductions"
192 "\nnoreductions Inverse of above"
193 "\nrecurrences Use tail-folding for loops containing fixed order "
194 "recurrences"
195 "\nnorecurrences Inverse of above"
196 "\nreverse Use tail-folding for loops requiring reversed "
197 "predicates"
198 "\nnoreverse Inverse of above"),
199 cl::location(TailFoldingOptionLoc));
200
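// Illustrative command-line values (editor's note, not part of the original
// source) showing how the grammar described above composes:
//   -sve-tail-folding=disabled            ; never use tail-folding
//   -sve-tail-folding=all+noreverse       ; start from "all", clear Reverse
//   -sve-tail-folding=default+reductions  ; CPU default, force Reductions on
// Each value is parsed by TailFoldingOption::operator= above: the first token
// selects InitialBits (or NeedsDefault), and every later "+flag"/"+noflag"
// token updates EnableBits or DisableBits.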
201// Experimental option that will only be fully functional when the
202// code-generator is changed to use SVE instead of NEON for all fixed-width
203// operations.
205 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
206
207// Experimental option that will only be fully functional when the cost-model
208// and code-generator have been changed to avoid using scalable vector
209// instructions that are not legal in streaming SVE mode.
211 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
212
213static bool isSMEABIRoutineCall(const CallInst &CI) {
214 const auto *F = CI.getCalledFunction();
215 return F && StringSwitch<bool>(F->getName())
216 .Case("__arm_sme_state", true)
217 .Case("__arm_tpidr2_save", true)
218 .Case("__arm_tpidr2_restore", true)
219 .Case("__arm_za_disable", true)
220 .Default(false);
221}
222
223/// Returns true if the function has explicit operations that can only be
224/// lowered using incompatible instructions for the selected mode. This also
225/// returns true if the function F may use or modify ZA state.
226static bool hasPossibleIncompatibleOps(const Function *F) {
227 for (const BasicBlock &BB : *F) {
228 for (const Instruction &I : BB) {
229 // Be conservative for now and assume that any call to inline asm or to
230 // intrinsics could result in non-streaming ops (e.g. calls to
231 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
232 // all native LLVM instructions can be lowered to compatible instructions.
233 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
234 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
235 isSMEABIRoutineCall(cast<CallInst>(I))))
236 return true;
237 }
238 }
239 return false;
240}
241
242bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
243 const Function *Callee) const {
244 SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);
245
246 // When inlining, we should consider the body of the function, not the
247 // interface.
248 if (CalleeAttrs.hasStreamingBody()) {
249 CalleeAttrs.set(SMEAttrs::SM_Compatible, false);
250 CalleeAttrs.set(SMEAttrs::SM_Enabled, true);
251 }
252
253 if (CalleeAttrs.isNewZA())
254 return false;
255
256 if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
257 CallerAttrs.requiresSMChange(CalleeAttrs)) {
258 if (hasPossibleIncompatibleOps(Callee))
259 return false;
260 }
261
262 const TargetMachine &TM = getTLI()->getTargetMachine();
263
264 const FeatureBitset &CallerBits =
265 TM.getSubtargetImpl(*Caller)->getFeatureBits();
266 const FeatureBitset &CalleeBits =
267 TM.getSubtargetImpl(*Callee)->getFeatureBits();
268
269 // Inline a callee if its target-features are a subset of the caller's
270 // target-features.
271 return (CallerBits & CalleeBits) == CalleeBits;
272}
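// Worked example (editor's note): if the caller was compiled with
// "+neon,+sve,+sve2" and the callee only requires "+neon,+sve", then
// (CallerBits & CalleeBits) == CalleeBits holds and inlining is allowed;
// swapping the two feature sets would fail the subset test above.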
273
274bool AArch64TTIImpl::areTypesABICompatible(
275 const Function *Caller, const Function *Callee,
276 const ArrayRef<Type *> &Types) const {
277 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
278 return false;
279
280 // We need to ensure that argument promotion does not attempt to promote
281 // pointers to fixed-length vector types larger than 128 bits like
282 // <8 x float> (and pointers to aggregate types which have such fixed-length
283 // vector type members) into the values of the pointees. Such vector types
284 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
285 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
286 // types can be safely treated as 128-bit NEON types and they cannot be
287 // distinguished in IR.
288 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
289 auto FVTy = dyn_cast<FixedVectorType>(Ty);
290 return FVTy &&
291 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
292 }))
293 return false;
294
295 return true;
296}
297
298unsigned
299AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call,
300 unsigned DefaultCallPenalty) const {
301 // This function calculates a penalty for executing Call in F.
302 //
303 // There are two ways this function can be called:
304 // (1) F:
305 // call from F -> G (the call here is Call)
306 //
307 // For (1), Call.getCaller() == F, so it will always return a high cost if
308 // a streaming-mode change is required (thus promoting the need to inline the
309 // function)
310 //
311 // (2) F:
312 // call from F -> G (the call here is not Call)
313 // G:
314 // call from G -> H (the call here is Call)
315 //
316 // For (2), if after inlining the body of G into F the call to H requires a
317 // streaming-mode change, and the call to G from F would also require a
318 // streaming-mode change, then there is benefit to do the streaming-mode
319 // change only once and avoid inlining of G into F.
320 SMEAttrs FAttrs(*F);
321 SMEAttrs CalleeAttrs(Call);
322 if (FAttrs.requiresSMChange(CalleeAttrs)) {
323 if (F == Call.getCaller()) // (1)
324 return CallPenaltyChangeSM * DefaultCallPenalty;
325 if (FAttrs.requiresSMChange(SMEAttrs(*Call.getCaller()))) // (2)
326 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
327 }
328
329 return DefaultCallPenalty;
330}
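// Worked example (editor's sketch, using the option defaults declared above
// and an assumed DefaultCallPenalty of 100): scenario (1) yields
// CallPenaltyChangeSM * 100 = 500, scenario (2) yields
// InlineCallPenaltyChangeSM * 100 = 1000, and a call that needs no PSTATE.SM
// change simply returns the unmodified DefaultCallPenalty.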
331
332bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
333 TargetTransformInfo::RegisterKind K) const {
334 assert(K != TargetTransformInfo::RGK_Scalar);
335 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
336 ST->isNeonAvailable());
337}
338
339/// Calculate the cost of materializing a 64-bit value. This helper
340/// method might only calculate a fraction of a larger immediate. Therefore it
341/// is valid to return a cost of ZERO.
342InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
343 // Check if the immediate can be encoded within an instruction.
344 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
345 return 0;
346
347 if (Val < 0)
348 Val = ~Val;
349
350 // Calculate how many moves we will need to materialize this constant.
351 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
352 AArch64_IMM::expandMOVImm(Val, 64, Insn);
353 return Insn.size();
354}
355
356/// Calculate the cost of materializing the given constant.
357InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
358 TTI::TargetCostKind CostKind) {
359 assert(Ty->isIntegerTy());
360
361 unsigned BitSize = Ty->getPrimitiveSizeInBits();
362 if (BitSize == 0)
363 return ~0U;
364
365 // Sign-extend all constants to a multiple of 64-bit.
366 APInt ImmVal = Imm;
367 if (BitSize & 0x3f)
368 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
369
370 // Split the constant into 64-bit chunks and calculate the cost for each
371 // chunk.
373 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
374 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
375 int64_t Val = Tmp.getSExtValue();
376 Cost += getIntImmCost(Val);
377 }
378 // We need at least one instruction to materialize the constant.
379 return std::max<InstructionCost>(1, Cost);
380}
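// Worked example (editor's note): a 128-bit constant is sign-extended and
// split into two 64-bit chunks by the loop above; a chunk that is zero or a
// valid logical immediate costs 0 via getIntImmCost(int64_t), and the final
// std::max guarantees the reported cost is never below one instruction.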
381
382InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
383 const APInt &Imm, Type *Ty,
384 TTI::TargetCostKind CostKind,
385 Instruction *Inst) {
386 assert(Ty->isIntegerTy());
387
388 unsigned BitSize = Ty->getPrimitiveSizeInBits();
389 // There is no cost model for constants with a bit size of 0. Return TCC_Free
390 // here, so that constant hoisting will ignore this constant.
391 if (BitSize == 0)
392 return TTI::TCC_Free;
393
394 unsigned ImmIdx = ~0U;
395 switch (Opcode) {
396 default:
397 return TTI::TCC_Free;
398 case Instruction::GetElementPtr:
399 // Always hoist the base address of a GetElementPtr.
400 if (Idx == 0)
401 return 2 * TTI::TCC_Basic;
402 return TTI::TCC_Free;
403 case Instruction::Store:
404 ImmIdx = 0;
405 break;
406 case Instruction::Add:
407 case Instruction::Sub:
408 case Instruction::Mul:
409 case Instruction::UDiv:
410 case Instruction::SDiv:
411 case Instruction::URem:
412 case Instruction::SRem:
413 case Instruction::And:
414 case Instruction::Or:
415 case Instruction::Xor:
416 case Instruction::ICmp:
417 ImmIdx = 1;
418 break;
419 // Always return TCC_Free for the shift value of a shift instruction.
420 case Instruction::Shl:
421 case Instruction::LShr:
422 case Instruction::AShr:
423 if (Idx == 1)
424 return TTI::TCC_Free;
425 break;
426 case Instruction::Trunc:
427 case Instruction::ZExt:
428 case Instruction::SExt:
429 case Instruction::IntToPtr:
430 case Instruction::PtrToInt:
431 case Instruction::BitCast:
432 case Instruction::PHI:
433 case Instruction::Call:
434 case Instruction::Select:
435 case Instruction::Ret:
436 case Instruction::Load:
437 break;
438 }
439
440 if (Idx == ImmIdx) {
441 int NumConstants = (BitSize + 63) / 64;
442 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
443 return (Cost <= NumConstants * TTI::TCC_Basic)
444 ? static_cast<int>(TTI::TCC_Free)
445 : Cost;
446 }
447 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
448}
449
450InstructionCost
451AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
452 const APInt &Imm, Type *Ty,
453 TTI::TargetCostKind CostKind) {
454 assert(Ty->isIntegerTy());
455
456 unsigned BitSize = Ty->getPrimitiveSizeInBits();
457 // There is no cost model for constants with a bit size of 0. Return TCC_Free
458 // here, so that constant hoisting will ignore this constant.
459 if (BitSize == 0)
460 return TTI::TCC_Free;
461
462 // Most (all?) AArch64 intrinsics do not support folding immediates into the
463 // selected instruction, so we compute the materialization cost for the
464 // immediate directly.
465 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
466 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
467
468 switch (IID) {
469 default:
470 return TTI::TCC_Free;
471 case Intrinsic::sadd_with_overflow:
472 case Intrinsic::uadd_with_overflow:
473 case Intrinsic::ssub_with_overflow:
474 case Intrinsic::usub_with_overflow:
475 case Intrinsic::smul_with_overflow:
476 case Intrinsic::umul_with_overflow:
477 if (Idx == 1) {
478 int NumConstants = (BitSize + 63) / 64;
479 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
480 return (Cost <= NumConstants * TTI::TCC_Basic)
481 ? static_cast<int>(TTI::TCC_Free)
482 : Cost;
483 }
484 break;
485 case Intrinsic::experimental_stackmap:
486 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
487 return TTI::TCC_Free;
488 break;
489 case Intrinsic::experimental_patchpoint_void:
490 case Intrinsic::experimental_patchpoint:
491 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
492 return TTI::TCC_Free;
493 break;
494 case Intrinsic::experimental_gc_statepoint:
495 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
496 return TTI::TCC_Free;
497 break;
498 }
499 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
500}
501
502TargetTransformInfo::PopcntSupportKind
503AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
504 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
505 if (TyWidth == 32 || TyWidth == 64)
506 return TTI::PSK_FastHardware;
507 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
508 return TTI::PSK_Software;
509}
510
511static bool isUnpackedVectorVT(EVT VecVT) {
512 return VecVT.isScalableVector() &&
513 VecVT.getSizeInBits().getKnownMinValue() < AArch64::SVEBitsPerBlock;
514}
515
516InstructionCost AArch64TTIImpl::getHistogramCost(const IntrinsicCostAttributes &ICA) const {
517 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
518 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
519
520 // Only allow (32b and 64b) integers or pointers for now...
521 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) ||
522 (EltTy->getScalarSizeInBits() != 32 &&
523 EltTy->getScalarSizeInBits() != 64))
524 return InstructionCost::getInvalid();
525
526 // FIXME: Hacky check for legal vector types. We can promote smaller types
527 // but we cannot legalize vectors via splitting for histcnt.
528 // FIXME: We should be able to generate histcnt for fixed-length vectors
529 // using ptrue with a specific VL.
530 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy))
531 if ((VTy->getElementCount().getKnownMinValue() != 2 &&
532 VTy->getElementCount().getKnownMinValue() != 4) ||
533 VTy->getPrimitiveSizeInBits().getKnownMinValue() > 128 ||
534 !VTy->isScalableTy())
535 return InstructionCost::getInvalid();
536
537 return InstructionCost(BaseHistCntCost);
538}
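// Example (editor's note): buckets of type <vscale x 2 x ptr> with i64
// elements pass every check above, so the returned cost is simply
// BaseHistCntCost (8 by default); unsupported element types or non-scalable
// bucket vectors produce an invalid cost instead.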
539
540InstructionCost
541AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
542 TTI::TargetCostKind CostKind) {
543 auto *RetTy = ICA.getReturnType();
544 switch (ICA.getID()) {
545 case Intrinsic::experimental_vector_histogram_add:
546 if (!ST->hasSVE2())
547 return InstructionCost::getInvalid();
548 return getHistogramCost(ICA);
549 case Intrinsic::umin:
550 case Intrinsic::umax:
551 case Intrinsic::smin:
552 case Intrinsic::smax: {
553 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
554 MVT::v8i16, MVT::v2i32, MVT::v4i32,
555 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
556 MVT::nxv2i64};
557 auto LT = getTypeLegalizationCost(RetTy);
558 // v2i64 types get converted to cmp+bif hence the cost of 2
559 if (LT.second == MVT::v2i64)
560 return LT.first * 2;
561 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
562 return LT.first;
563 break;
564 }
565 case Intrinsic::sadd_sat:
566 case Intrinsic::ssub_sat:
567 case Intrinsic::uadd_sat:
568 case Intrinsic::usub_sat: {
569 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
570 MVT::v8i16, MVT::v2i32, MVT::v4i32,
571 MVT::v2i64};
572 auto LT = getTypeLegalizationCost(RetTy);
573 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
574 // need to extend the type, as it uses shr(qadd(shl, shl)).
575 unsigned Instrs =
576 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
577 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
578 return LT.first * Instrs;
579 break;
580 }
581 case Intrinsic::abs: {
582 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
583 MVT::v8i16, MVT::v2i32, MVT::v4i32,
584 MVT::v2i64};
585 auto LT = getTypeLegalizationCost(RetTy);
586 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
587 return LT.first;
588 break;
589 }
590 case Intrinsic::bswap: {
591 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
592 MVT::v4i32, MVT::v2i64};
593 auto LT = getTypeLegalizationCost(RetTy);
594 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
595 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
596 return LT.first;
597 break;
598 }
599 case Intrinsic::experimental_stepvector: {
600 InstructionCost Cost = 1; // Cost of the `index' instruction
601 auto LT = getTypeLegalizationCost(RetTy);
602 // Legalisation of illegal vectors involves an `index' instruction plus
603 // (LT.first - 1) vector adds.
604 if (LT.first > 1) {
605 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
606 InstructionCost AddCost =
607 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
608 Cost += AddCost * (LT.first - 1);
609 }
610 return Cost;
611 }
612 case Intrinsic::vector_extract:
613 case Intrinsic::vector_insert: {
614 // If both the vector and subvector types are legal types and the index
615 // is 0, then this should be a no-op or simple operation; return a
616 // relatively low cost.
617
618 // If arguments aren't actually supplied, then we cannot determine the
619 // value of the index. We also want to skip predicate types.
620 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
621 ICA.getReturnType()->getScalarType()->isIntegerTy(1))
622 break;
623
624 LLVMContext &C = RetTy->getContext();
625 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
626 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
627 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
628 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
629 // Skip this if either the vector or subvector types are unpacked
630 // SVE types; they may get lowered to stack stores and loads.
631 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
632 break;
633
634 TargetLoweringBase::LegalizeKind SubVecLK =
635 getTLI()->getTypeConversion(C, SubVecVT);
636 TargetLoweringBase::LegalizeKind VecLK =
637 getTLI()->getTypeConversion(C, VecVT);
638 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
639 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
640 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
641 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
642 return TTI::TCC_Free;
643 break;
644 }
645 case Intrinsic::bitreverse: {
646 static const CostTblEntry BitreverseTbl[] = {
647 {Intrinsic::bitreverse, MVT::i32, 1},
648 {Intrinsic::bitreverse, MVT::i64, 1},
649 {Intrinsic::bitreverse, MVT::v8i8, 1},
650 {Intrinsic::bitreverse, MVT::v16i8, 1},
651 {Intrinsic::bitreverse, MVT::v4i16, 2},
652 {Intrinsic::bitreverse, MVT::v8i16, 2},
653 {Intrinsic::bitreverse, MVT::v2i32, 2},
654 {Intrinsic::bitreverse, MVT::v4i32, 2},
655 {Intrinsic::bitreverse, MVT::v1i64, 2},
656 {Intrinsic::bitreverse, MVT::v2i64, 2},
657 };
658 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
659 const auto *Entry =
660 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
661 if (Entry) {
662 // The cost model uses the legal type (i32) that i8 and i16 are promoted
663 // to, plus 1, so that we match the actual lowering cost.
664 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
665 TLI->getValueType(DL, RetTy, true) == MVT::i16)
666 return LegalisationCost.first * Entry->Cost + 1;
667
668 return LegalisationCost.first * Entry->Cost;
669 }
670 break;
671 }
672 case Intrinsic::ctpop: {
673 if (!ST->hasNEON()) {
674 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
675 return getTypeLegalizationCost(RetTy).first * 12;
676 }
677 static const CostTblEntry CtpopCostTbl[] = {
678 {ISD::CTPOP, MVT::v2i64, 4},
679 {ISD::CTPOP, MVT::v4i32, 3},
680 {ISD::CTPOP, MVT::v8i16, 2},
681 {ISD::CTPOP, MVT::v16i8, 1},
682 {ISD::CTPOP, MVT::i64, 4},
683 {ISD::CTPOP, MVT::v2i32, 3},
684 {ISD::CTPOP, MVT::v4i16, 2},
685 {ISD::CTPOP, MVT::v8i8, 1},
686 {ISD::CTPOP, MVT::i32, 5},
687 };
688 auto LT = getTypeLegalizationCost(RetTy);
689 MVT MTy = LT.second;
690 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
691 // Extra cost of +1 when illegal vector types are legalized by promoting
692 // the integer type.
693 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
694 RetTy->getScalarSizeInBits()
695 ? 1
696 : 0;
697 return LT.first * Entry->Cost + ExtraCost;
698 }
699 break;
700 }
701 case Intrinsic::sadd_with_overflow:
702 case Intrinsic::uadd_with_overflow:
703 case Intrinsic::ssub_with_overflow:
704 case Intrinsic::usub_with_overflow:
705 case Intrinsic::smul_with_overflow:
706 case Intrinsic::umul_with_overflow: {
707 static const CostTblEntry WithOverflowCostTbl[] = {
708 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
709 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
710 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
711 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
712 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
713 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
714 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
715 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
716 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
717 {Intrinsic::usub_with_overflow, MVT::i8, 3},
718 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
719 {Intrinsic::usub_with_overflow, MVT::i16, 3},
720 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
721 {Intrinsic::usub_with_overflow, MVT::i32, 1},
722 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
723 {Intrinsic::usub_with_overflow, MVT::i64, 1},
724 {Intrinsic::smul_with_overflow, MVT::i8, 5},
725 {Intrinsic::umul_with_overflow, MVT::i8, 4},
726 {Intrinsic::smul_with_overflow, MVT::i16, 5},
727 {Intrinsic::umul_with_overflow, MVT::i16, 4},
728 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
729 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
730 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
731 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
732 };
733 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
734 if (MTy.isSimple())
735 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
736 MTy.getSimpleVT()))
737 return Entry->Cost;
738 break;
739 }
740 case Intrinsic::fptosi_sat:
741 case Intrinsic::fptoui_sat: {
742 if (ICA.getArgTypes().empty())
743 break;
744 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
745 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
746 EVT MTy = TLI->getValueType(DL, RetTy);
747 // Check for the legal types, which are where the size of the input and the
748 // output are the same, or we are using cvt f64->i32 or f32->i64.
749 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
750 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
751 LT.second == MVT::v2f64) &&
752 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
753 (LT.second == MVT::f64 && MTy == MVT::i32) ||
754 (LT.second == MVT::f32 && MTy == MVT::i64)))
755 return LT.first;
756 // Similarly for fp16 sizes
757 if (ST->hasFullFP16() &&
758 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
759 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
760 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
761 return LT.first;
762
763 // Otherwise we use a legal convert followed by a min+max
764 if ((LT.second.getScalarType() == MVT::f32 ||
765 LT.second.getScalarType() == MVT::f64 ||
766 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
767 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
768 Type *LegalTy =
769 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
770 if (LT.second.isVector())
771 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
772 InstructionCost Cost = 1;
773 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
774 LegalTy, {LegalTy, LegalTy});
775 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
776 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
777 LegalTy, {LegalTy, LegalTy});
778 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
779 return LT.first * Cost;
780 }
781 break;
782 }
783 case Intrinsic::fshl:
784 case Intrinsic::fshr: {
785 if (ICA.getArgs().empty())
786 break;
787
788 // TODO: Add handling for fshl where third argument is not a constant.
789 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
790 if (!OpInfoZ.isConstant())
791 break;
792
793 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
794 if (OpInfoZ.isUniform()) {
795 // FIXME: The costs could be lower if the codegen is better.
796 static const CostTblEntry FshlTbl[] = {
797 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
798 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
799 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
800 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
801 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
802 // to avoid having to duplicate the costs.
803 const auto *Entry =
804 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
805 if (Entry)
806 return LegalisationCost.first * Entry->Cost;
807 }
808
809 auto TyL = getTypeLegalizationCost(RetTy);
810 if (!RetTy->isIntegerTy())
811 break;
812
813 // Estimate cost manually, as types like i8 and i16 will get promoted to
814 // i32 and CostTableLookup will ignore the extra conversion cost.
815 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
816 RetTy->getScalarSizeInBits() < 64) ||
817 (RetTy->getScalarSizeInBits() % 64 != 0);
818 unsigned ExtraCost = HigherCost ? 1 : 0;
819 if (RetTy->getScalarSizeInBits() == 32 ||
820 RetTy->getScalarSizeInBits() == 64)
821 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
822 // extr instruction.
823 else if (HigherCost)
824 ExtraCost = 1;
825 else
826 break;
827 return TyL.first + ExtraCost;
828 }
829 case Intrinsic::get_active_lane_mask: {
830 auto *RetTy = dyn_cast<FixedVectorType>(ICA.getReturnType());
831 if (RetTy) {
832 EVT RetVT = getTLI()->getValueType(DL, RetTy);
833 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
834 if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&
835 !getTLI()->isTypeLegal(RetVT)) {
836 // We don't have enough context at this point to determine if the mask
837 // is going to be kept live after the block, which will force the vXi1
838 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
839 // For now, we just assume the vectorizer created this intrinsic and
840 // the result will be the input for a PHI. In this case the cost will
841 // be extremely high for fixed-width vectors.
842 // NOTE: getScalarizationOverhead returns a cost that's far too
843 // pessimistic for the actual generated codegen. In reality there are
844 // two instructions generated per lane.
845 return RetTy->getNumElements() * 2;
846 }
847 }
848 break;
849 }
850 default:
851 break;
852 }
853 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
854}
855
856/// The function removes redundant reinterpret casts (to/from svbool) in the
857/// presence of control flow, i.e. when the cast operand is a PHI node.
858static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
859 IntrinsicInst &II) {
860 SmallVector<Instruction *, 32> Worklist;
861 auto RequiredType = II.getType();
862
863 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
864 assert(PN && "Expected Phi Node!");
865
866 // Don't create a new Phi unless we can remove the old one.
867 if (!PN->hasOneUse())
868 return std::nullopt;
869
870 for (Value *IncValPhi : PN->incoming_values()) {
871 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
872 if (!Reinterpret ||
873 Reinterpret->getIntrinsicID() !=
874 Intrinsic::aarch64_sve_convert_to_svbool ||
875 RequiredType != Reinterpret->getArgOperand(0)->getType())
876 return std::nullopt;
877 }
878
879 // Create the new Phi
880 IC.Builder.SetInsertPoint(PN);
881 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
882 Worklist.push_back(PN);
883
884 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
885 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
886 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
887 Worklist.push_back(Reinterpret);
888 }
889
890 // Cleanup Phi Node and reinterprets
891 return IC.replaceInstUsesWith(II, NPN);
892}
893
894// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
895// => (binop (pred) (from_svbool _) (from_svbool _))
896//
897// The above transformation eliminates a `to_svbool` in the predicate
898// operand of bitwise operation `binop` by narrowing the vector width of
899// the operation. For example, it would convert a `<vscale x 16 x i1>
900// and` into a `<vscale x 4 x i1> and`. This is profitable because
901// to_svbool must zero the new lanes during widening, whereas
902// from_svbool is free.
903static std::optional<Instruction *>
904tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
905 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
906 if (!BinOp)
907 return std::nullopt;
908
909 auto IntrinsicID = BinOp->getIntrinsicID();
910 switch (IntrinsicID) {
911 case Intrinsic::aarch64_sve_and_z:
912 case Intrinsic::aarch64_sve_bic_z:
913 case Intrinsic::aarch64_sve_eor_z:
914 case Intrinsic::aarch64_sve_nand_z:
915 case Intrinsic::aarch64_sve_nor_z:
916 case Intrinsic::aarch64_sve_orn_z:
917 case Intrinsic::aarch64_sve_orr_z:
918 break;
919 default:
920 return std::nullopt;
921 }
922
923 auto BinOpPred = BinOp->getOperand(0);
924 auto BinOpOp1 = BinOp->getOperand(1);
925 auto BinOpOp2 = BinOp->getOperand(2);
926
927 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
928 if (!PredIntr ||
929 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
930 return std::nullopt;
931
932 auto PredOp = PredIntr->getOperand(0);
933 auto PredOpTy = cast<VectorType>(PredOp->getType());
934 if (PredOpTy != II.getType())
935 return std::nullopt;
936
937 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
938 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
939 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
940 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
941 if (BinOpOp1 == BinOpOp2)
942 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
943 else
944 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
945 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
946
947 auto NarrowedBinOp =
948 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
949 return IC.replaceInstUsesWith(II, NarrowedBinOp);
950}
951
952static std::optional<Instruction *>
953instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
954 // If the reinterpret instruction operand is a PHI Node
955 if (isa<PHINode>(II.getArgOperand(0)))
956 return processPhiNode(IC, II);
957
958 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
959 return BinOpCombine;
960
961 // Ignore converts to/from svcount_t.
962 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
963 isa<TargetExtType>(II.getType()))
964 return std::nullopt;
965
966 SmallVector<Instruction *, 32> CandidatesForRemoval;
967 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
968
969 const auto *IVTy = cast<VectorType>(II.getType());
970
971 // Walk the chain of conversions.
972 while (Cursor) {
973 // If the type of the cursor has fewer lanes than the final result, zeroing
974 // must take place, which breaks the equivalence chain.
975 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
976 if (CursorVTy->getElementCount().getKnownMinValue() <
977 IVTy->getElementCount().getKnownMinValue())
978 break;
979
980 // If the cursor has the same type as I, it is a viable replacement.
981 if (Cursor->getType() == IVTy)
982 EarliestReplacement = Cursor;
983
984 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
985
986 // If this is not an SVE conversion intrinsic, this is the end of the chain.
987 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
988 Intrinsic::aarch64_sve_convert_to_svbool ||
989 IntrinsicCursor->getIntrinsicID() ==
990 Intrinsic::aarch64_sve_convert_from_svbool))
991 break;
992
993 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
994 Cursor = IntrinsicCursor->getOperand(0);
995 }
996
997 // If no viable replacement in the conversion chain was found, there is
998 // nothing to do.
999 if (!EarliestReplacement)
1000 return std::nullopt;
1001
1002 return IC.replaceInstUsesWith(II, EarliestReplacement);
1003}
1004
1005static bool isAllActivePredicate(Value *Pred) {
1006 // Look through convert.from.svbool(convert.to.svbool(...) chain.
1007 Value *UncastedPred;
1008 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1009 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1010 m_Value(UncastedPred)))))
1011 // If the predicate has the same number of lanes or fewer than the
1012 // uncasted predicate then we know the casting has no effect.
1013 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1014 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1015 Pred = UncastedPred;
1016
1017 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1018 m_ConstantInt<AArch64SVEPredPattern::all>()));
1019}
1020
1021// Erase unary operation where predicate has all inactive lanes
1022static std::optional<Instruction *>
1023instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II,
1024 int PredPos) {
1025 if (match(II.getOperand(PredPos), m_ZeroInt())) {
1026 return IC.eraseInstFromFunction(II);
1027 }
1028 return std::nullopt;
1029}
1030
1031// Simplify unary operation where predicate has all inactive lanes by replacing
1032// instruction with zeroed object
1033static std::optional<Instruction *>
1034instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II) {
1035 if (match(II.getOperand(0), m_ZeroInt())) {
1036 Constant *Node;
1037 Type *RetTy = II.getType();
1038 if (RetTy->isStructTy()) {
1039 auto StructT = cast<StructType>(RetTy);
1040 auto VecT = StructT->getElementType(0);
1041 SmallVector<Constant *, 4> ZerVec;
1042 for (unsigned i = 0; i < StructT->getNumElements(); i++) {
1043 ZerVec.push_back(VecT->isFPOrFPVectorTy() ? ConstantFP::get(VecT, 0.0)
1044 : ConstantInt::get(VecT, 0));
1045 }
1046 Node = ConstantStruct::get(StructT, ZerVec);
1047 } else if (RetTy->isFPOrFPVectorTy())
1048 Node = ConstantFP::get(RetTy, 0.0);
1049 else
1050 Node = ConstantInt::get(II.getType(), 0);
1051
1052 IC.replaceInstUsesWith(II, Node);
1053 return IC.eraseInstFromFunction(II);
1054 }
1055 return std::nullopt;
1056}
1057
1058static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
1059 IntrinsicInst &II) {
1060 // svsel(ptrue, x, y) => x
1061 auto *OpPredicate = II.getOperand(0);
1062 if (isAllActivePredicate(OpPredicate))
1063 return IC.replaceInstUsesWith(II, II.getOperand(1));
1064
1065 auto Select =
1066 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
1067 return IC.replaceInstUsesWith(II, Select);
1068}
1069
1070static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
1071 IntrinsicInst &II) {
1072 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1073 if (!Pg)
1074 return std::nullopt;
1075
1076 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1077 return std::nullopt;
1078
1079 const auto PTruePattern =
1080 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1081 if (PTruePattern != AArch64SVEPredPattern::vl1)
1082 return std::nullopt;
1083
1084 // The intrinsic is inserting into lane zero so use an insert instead.
1085 auto *IdxTy = Type::getInt64Ty(II.getContext());
1086 auto *Insert = InsertElementInst::Create(
1087 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
1088 Insert->insertBefore(&II);
1089 Insert->takeName(&II);
1090
1091 return IC.replaceInstUsesWith(II, Insert);
1092}
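// Sketch of the rewrite above (editor's note, IR names are illustrative):
//   %r = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(
//            <vscale x 4 x i32> %passthru, <vscale x 4 x i1> %pg_vl1, i32 %s)
// with %pg_vl1 = ptrue(vl1) only writes lane 0, so it is replaced by
//   %r = insertelement <vscale x 4 x i32> %passthru, i32 %s, i64 0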
1093
1094static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
1095 IntrinsicInst &II) {
1096 // Replace DupX with a regular IR splat.
1097 auto *RetTy = cast<ScalableVectorType>(II.getType());
1098 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
1099 II.getArgOperand(0));
1100 Splat->takeName(&II);
1101 return IC.replaceInstUsesWith(II, Splat);
1102}
1103
1104static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
1105 IntrinsicInst &II) {
1106 LLVMContext &Ctx = II.getContext();
1107
1108 // Check that the predicate is all active
1109 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
1110 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1111 return std::nullopt;
1112
1113 const auto PTruePattern =
1114 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
1115 if (PTruePattern != AArch64SVEPredPattern::all)
1116 return std::nullopt;
1117
1118 // Check that we have a compare of zero..
1119 auto *SplatValue =
1120 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
1121 if (!SplatValue || !SplatValue->isZero())
1122 return std::nullopt;
1123
1124 // ..against a dupq
1125 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
1126 if (!DupQLane ||
1127 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
1128 return std::nullopt;
1129
1130 // Where the dupq is a lane 0 replicate of a vector insert
1131 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
1132 return std::nullopt;
1133
1134 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
1135 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
1136 return std::nullopt;
1137
1138 // Where the vector insert is a fixed constant vector insert into undef at
1139 // index zero
1140 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
1141 return std::nullopt;
1142
1143 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
1144 return std::nullopt;
1145
1146 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
1147 if (!ConstVec)
1148 return std::nullopt;
1149
1150 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
1151 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
1152 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
1153 return std::nullopt;
1154
1155 unsigned NumElts = VecTy->getNumElements();
1156 unsigned PredicateBits = 0;
1157
1158 // Expand intrinsic operands to a 16-bit byte level predicate
1159 for (unsigned I = 0; I < NumElts; ++I) {
1160 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
1161 if (!Arg)
1162 return std::nullopt;
1163 if (!Arg->isZero())
1164 PredicateBits |= 1 << (I * (16 / NumElts));
1165 }
1166
1167 // If all bits are zero bail early with an empty predicate
1168 if (PredicateBits == 0) {
1169 auto *PFalse = Constant::getNullValue(II.getType());
1170 PFalse->takeName(&II);
1171 return IC.replaceInstUsesWith(II, PFalse);
1172 }
1173
1174 // Calculate largest predicate type used (where byte predicate is largest)
1175 unsigned Mask = 8;
1176 for (unsigned I = 0; I < 16; ++I)
1177 if ((PredicateBits & (1 << I)) != 0)
1178 Mask |= (I % 8);
1179
1180 unsigned PredSize = Mask & -Mask;
1181 auto *PredType = ScalableVectorType::get(
1182 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
1183
1184 // Ensure all relevant bits are set
1185 for (unsigned I = 0; I < 16; I += PredSize)
1186 if ((PredicateBits & (1 << I)) == 0)
1187 return std::nullopt;
1188
1189 auto *PTruePat =
1190 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1191 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1192 {PredType}, {PTruePat});
1193 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
1194 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
1195 auto *ConvertFromSVBool =
1196 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
1197 {II.getType()}, {ConvertToSVBool});
1198
1199 ConvertFromSVBool->takeName(&II);
1200 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
1201}
1202
1203static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
1204 IntrinsicInst &II) {
1205 Value *Pg = II.getArgOperand(0);
1206 Value *Vec = II.getArgOperand(1);
1207 auto IntrinsicID = II.getIntrinsicID();
1208 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
1209
1210 // lastX(splat(X)) --> X
1211 if (auto *SplatVal = getSplatValue(Vec))
1212 return IC.replaceInstUsesWith(II, SplatVal);
1213
1214 // If x and/or y is a splat value then:
1215 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
1216 Value *LHS, *RHS;
1217 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
1218 if (isSplatValue(LHS) || isSplatValue(RHS)) {
1219 auto *OldBinOp = cast<BinaryOperator>(Vec);
1220 auto OpC = OldBinOp->getOpcode();
1221 auto *NewLHS =
1222 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
1223 auto *NewRHS =
1224 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
1225 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
1226 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
1227 return IC.replaceInstUsesWith(II, NewBinOp);
1228 }
1229 }
1230
1231 auto *C = dyn_cast<Constant>(Pg);
1232 if (IsAfter && C && C->isNullValue()) {
1233 // The intrinsic is extracting lane 0 so use an extract instead.
1234 auto *IdxTy = Type::getInt64Ty(II.getContext());
1235 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
1236 Extract->insertBefore(&II);
1237 Extract->takeName(&II);
1238 return IC.replaceInstUsesWith(II, Extract);
1239 }
1240
1241 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
1242 if (!IntrPG)
1243 return std::nullopt;
1244
1245 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
1246 return std::nullopt;
1247
1248 const auto PTruePattern =
1249 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
1250
1251 // Can the intrinsic's predicate be converted to a known constant index?
1252 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
1253 if (!MinNumElts)
1254 return std::nullopt;
1255
1256 unsigned Idx = MinNumElts - 1;
1257 // Increment the index if extracting the element after the last active
1258 // predicate element.
1259 if (IsAfter)
1260 ++Idx;
1261
1262 // Ignore extracts whose index is larger than the known minimum vector
1263 // length. NOTE: This is an artificial constraint where we prefer to
1264 // maintain what the user asked for until an alternative is proven faster.
1265 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1266 if (Idx >= PgVTy->getMinNumElements())
1267 return std::nullopt;
1268
1269 // The intrinsic is extracting a fixed lane so use an extract instead.
1270 auto *IdxTy = Type::getInt64Ty(II.getContext());
1271 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1272 Extract->insertBefore(&II);
1273 Extract->takeName(&II);
1274 return IC.replaceInstUsesWith(II, Extract);
1275}
1276
1277static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1278 IntrinsicInst &II) {
1279 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1280 // integer variant across a variety of micro-architectures. Replace scalar
1281 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1282 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1283 // depending on the micro-architecture, but has been observed as generally
1284 // being faster, particularly when the CLAST[AB] op is a loop-carried
1285 // dependency.
1286 Value *Pg = II.getArgOperand(0);
1287 Value *Fallback = II.getArgOperand(1);
1288 Value *Vec = II.getArgOperand(2);
1289 Type *Ty = II.getType();
1290
1291 if (!Ty->isIntegerTy())
1292 return std::nullopt;
1293
1294 Type *FPTy;
1295 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1296 default:
1297 return std::nullopt;
1298 case 16:
1299 FPTy = IC.Builder.getHalfTy();
1300 break;
1301 case 32:
1302 FPTy = IC.Builder.getFloatTy();
1303 break;
1304 case 64:
1305 FPTy = IC.Builder.getDoubleTy();
1306 break;
1307 }
1308
1309 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1310 auto *FPVTy = VectorType::get(
1311 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1312 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1313 auto *FPII = IC.Builder.CreateIntrinsic(
1314 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1315 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1316 return IC.replaceInstUsesWith(II, FPIItoInt);
1317}
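// Sketch of the resulting sequence for an i32 clast[ab] (editor's note,
// illustrative only): the scalar fallback and the vector operand are bitcast
// to f32 / a float vector of the same element count, the same intrinsic is
// re-emitted on the floating-point types, and the result is bitcast back to
// i32, trading a couple of FMOVs for the faster SIMD&FP form of CLAST[AB].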
1318
1319static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1320 IntrinsicInst &II) {
1321 LLVMContext &Ctx = II.getContext();
1322 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1323 // can work with RDFFR_PP for ptest elimination.
1324 auto *AllPat =
1325 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1326 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1327 {II.getType()}, {AllPat});
1328 auto *RDFFR =
1329 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1330 RDFFR->takeName(&II);
1331 return IC.replaceInstUsesWith(II, RDFFR);
1332}
1333
1334static std::optional<Instruction *>
1335instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1336 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1337
1338 if (Pattern == AArch64SVEPredPattern::all) {
1339 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1340 auto *VScale = IC.Builder.CreateVScale(StepVal);
1341 VScale->takeName(&II);
1342 return IC.replaceInstUsesWith(II, VScale);
1343 }
1344
1345 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1346
1347 return MinNumElts && NumElts >= MinNumElts
1348 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1349 II, ConstantInt::get(II.getType(), MinNumElts)))
1350 : std::nullopt;
1351}
1352
1353static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1354 IntrinsicInst &II) {
1355 Value *PgVal = II.getArgOperand(0);
1356 Value *OpVal = II.getArgOperand(1);
1357
1358 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1359 // Later optimizations prefer this form.
1360 if (PgVal == OpVal &&
1361 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1362 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1363 Value *Ops[] = {PgVal, OpVal};
1364 Type *Tys[] = {PgVal->getType()};
1365
1366 auto *PTest =
1367 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1368 PTest->takeName(&II);
1369
1370 return IC.replaceInstUsesWith(II, PTest);
1371 }
1372
1373 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1374 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1375
1376 if (!Pg || !Op)
1377 return std::nullopt;
1378
1379 Intrinsic::ID OpIID = Op->getIntrinsicID();
1380
1381 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1382 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1383 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1384 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1385 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1386
1387 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1388
1389 PTest->takeName(&II);
1390 return IC.replaceInstUsesWith(II, PTest);
1391 }
1392
1393 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1394 // Later optimizations may rewrite sequence to use the flag-setting variant
1395 // of instruction X to remove PTEST.
1396 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1397 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1398 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1399 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1400 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1401 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1402 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1403 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1404 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1405 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1406 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1407 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1408 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1409 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1410 Type *Tys[] = {Pg->getType()};
1411
1412 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1413 PTest->takeName(&II);
1414
1415 return IC.replaceInstUsesWith(II, PTest);
1416 }
1417
1418 return std::nullopt;
1419}
1420
1421template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1422static std::optional<Instruction *>
1423instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1424 bool MergeIntoAddendOp) {
1425 Value *P = II.getOperand(0);
1426 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1427 if (MergeIntoAddendOp) {
1428 AddendOp = II.getOperand(1);
1429 Mul = II.getOperand(2);
1430 } else {
1431 AddendOp = II.getOperand(2);
1432 Mul = II.getOperand(1);
1433 }
1434
1435 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1436 m_Value(MulOp1))))
1437 return std::nullopt;
1438
1439 if (!Mul->hasOneUse())
1440 return std::nullopt;
1441
1442 Instruction *FMFSource = nullptr;
1443 if (II.getType()->isFPOrFPVectorTy()) {
1444 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1445 // Stop the combine when the flags on the inputs differ in case dropping
1446 // flags would lead to us missing out on more beneficial optimizations.
1447 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1448 return std::nullopt;
1449 if (!FAddFlags.allowContract())
1450 return std::nullopt;
1451 FMFSource = &II;
1452 }
1453
1454 CallInst *Res;
1455 if (MergeIntoAddendOp)
1456 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1457 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1458 else
1459 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1460 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1461
1462 return IC.replaceInstUsesWith(II, Res);
1463}
1464
1465static std::optional<Instruction *>
1466instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1467 Value *Pred = II.getOperand(0);
1468 Value *PtrOp = II.getOperand(1);
1469 Type *VecTy = II.getType();
1470
1471 // Replace by zero constant when all lanes are inactive
1472 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1473 return II_NA;
1474
1475 if (isAllActivePredicate(Pred)) {
1476 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1477 Load->copyMetadata(II);
1478 return IC.replaceInstUsesWith(II, Load);
1479 }
1480
1481 CallInst *MaskedLoad =
1482 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1483 Pred, ConstantAggregateZero::get(VecTy));
1484 MaskedLoad->copyMetadata(II);
1485 return IC.replaceInstUsesWith(II, MaskedLoad);
1486}
1487
1488static std::optional<Instruction *>
1489instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1490 Value *VecOp = II.getOperand(0);
1491 Value *Pred = II.getOperand(1);
1492 Value *PtrOp = II.getOperand(2);
1493
1494 if (isAllActivePredicate(Pred)) {
1495 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1496 Store->copyMetadata(II);
1497 return IC.eraseInstFromFunction(II);
1498 }
1499
1500 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1501 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1502 MaskedStore->copyMetadata(II);
1503 return IC.eraseInstFromFunction(II);
1504}
1505
1506static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1507 switch (Intrinsic) {
1508 case Intrinsic::aarch64_sve_fmul_u:
1509 return Instruction::BinaryOps::FMul;
1510 case Intrinsic::aarch64_sve_fadd_u:
1511 return Instruction::BinaryOps::FAdd;
1512 case Intrinsic::aarch64_sve_fsub_u:
1513 return Instruction::BinaryOps::FSub;
1514 default:
1515 return Instruction::BinaryOpsEnd;
1516 }
1517}
1518
1519static std::optional<Instruction *>
1520instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1521 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1522 if (II.isStrictFP())
1523 return std::nullopt;
1524
1525 auto *OpPredicate = II.getOperand(0);
1526 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1527 if (BinOpCode == Instruction::BinaryOpsEnd ||
1528 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1529 m_ConstantInt<AArch64SVEPredPattern::all>())))
1530 return std::nullopt;
1531 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1532 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1533 auto BinOp =
1534 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1535 return IC.replaceInstUsesWith(II, BinOp);
1536}
1537
1538// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1539// sve.add_u).
1540static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1541 Intrinsic::ID IID) {
1542 auto *OpPredicate = II.getOperand(0);
1543 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1544 m_ConstantInt<AArch64SVEPredPattern::all>())))
1545 return std::nullopt;
1546
1547 auto *Mod = II.getModule();
1548 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1549 II.setCalledFunction(NewDecl);
1550
1551 return &II;
1552}
1553
1554// Simplify operations where predicate has all inactive lanes or try to replace
1555// with _u form when all lanes are active
1556static std::optional<Instruction *>
1557instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II,
1558 Intrinsic::ID IID) {
1559 if (match(II.getOperand(0), m_ZeroInt())) {
1560 // llvm_ir, pred(0), op1, op2 - Spec says to return op1 when all lanes are
1561 // inactive for sv[func]_m
1562 return IC.replaceInstUsesWith(II, II.getOperand(1));
1563 }
1564 return instCombineSVEAllActive(II, IID);
1565}
1566
1567static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1568 IntrinsicInst &II) {
1569 if (auto II_U =
1570 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_add_u))
1571 return II_U;
1572 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1573 Intrinsic::aarch64_sve_mla>(
1574 IC, II, true))
1575 return MLA;
1576 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1577 Intrinsic::aarch64_sve_mad>(
1578 IC, II, false))
1579 return MAD;
1580 return std::nullopt;
1581}
1582
1583static std::optional<Instruction *>
1584instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1585 if (auto II_U =
1586 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fadd_u))
1587 return II_U;
1588 if (auto FMLA =
1589 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1590 Intrinsic::aarch64_sve_fmla>(IC, II,
1591 true))
1592 return FMLA;
1593 if (auto FMAD =
1594 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1595 Intrinsic::aarch64_sve_fmad>(IC, II,
1596 false))
1597 return FMAD;
1598 if (auto FMLA =
1599 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1600 Intrinsic::aarch64_sve_fmla>(IC, II,
1601 true))
1602 return FMLA;
1603 return std::nullopt;
1604}
1605
1606static std::optional<Instruction *>
1607instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1608 if (auto FMLA =
1609 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1610 Intrinsic::aarch64_sve_fmla>(IC, II,
1611 true))
1612 return FMLA;
1613 if (auto FMAD =
1614 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1615 Intrinsic::aarch64_sve_fmad>(IC, II,
1616 false))
1617 return FMAD;
1618 if (auto FMLA_U =
1619 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1620 Intrinsic::aarch64_sve_fmla_u>(
1621 IC, II, true))
1622 return FMLA_U;
1623 return instCombineSVEVectorBinOp(IC, II);
1624}
1625
1626static std::optional<Instruction *>
1627instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1628 if (auto II_U =
1629 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fsub_u))
1630 return II_U;
1631 if (auto FMLS =
1632 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1633 Intrinsic::aarch64_sve_fmls>(IC, II,
1634 true))
1635 return FMLS;
1636 if (auto FMSB =
1637 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1638 Intrinsic::aarch64_sve_fnmsb>(
1639 IC, II, false))
1640 return FMSB;
1641 if (auto FMLS =
1642 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1643 Intrinsic::aarch64_sve_fmls>(IC, II,
1644 true))
1645 return FMLS;
1646 return std::nullopt;
1647}
1648
1649static std::optional<Instruction *>
1650instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1651 if (auto FMLS =
1652 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1653 Intrinsic::aarch64_sve_fmls>(IC, II,
1654 true))
1655 return FMLS;
1656 if (auto FMSB =
1657 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1658 Intrinsic::aarch64_sve_fnmsb>(
1659 IC, II, false))
1660 return FMSB;
1661 if (auto FMLS_U =
1662 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1663 Intrinsic::aarch64_sve_fmls_u>(
1664 IC, II, true))
1665 return FMLS_U;
1666 return instCombineSVEVectorBinOp(IC, II);
1667}
1668
1669static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1670 IntrinsicInst &II) {
1671 if (auto II_U =
1672 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sub_u))
1673 return II_U;
1674 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1675 Intrinsic::aarch64_sve_mls>(
1676 IC, II, true))
1677 return MLS;
1678 return std::nullopt;
1679}
1680
1681static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1682 IntrinsicInst &II,
1683 Intrinsic::ID IID) {
1684 auto *OpPredicate = II.getOperand(0);
1685 auto *OpMultiplicand = II.getOperand(1);
1686 auto *OpMultiplier = II.getOperand(2);
1687
1688 // Return true if a given instruction is a unit splat value, false otherwise.
1689 auto IsUnitSplat = [](auto *I) {
1690 auto *SplatValue = getSplatValue(I);
1691 if (!SplatValue)
1692 return false;
1693 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1694 };
1695
1696 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1697 // with a unit splat value, false otherwise.
1698 auto IsUnitDup = [](auto *I) {
1699 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1700 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1701 return false;
1702
1703 auto *SplatValue = IntrI->getOperand(2);
1704 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1705 };
1706
1707 if (IsUnitSplat(OpMultiplier)) {
1708 // [f]mul pg %n, (dupx 1) => %n
1709 OpMultiplicand->takeName(&II);
1710 return IC.replaceInstUsesWith(II, OpMultiplicand);
1711 } else if (IsUnitDup(OpMultiplier)) {
1712 // [f]mul pg %n, (dup pg 1) => %n
1713 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1714 auto *DupPg = DupInst->getOperand(1);
1715 // TODO: this is naive. The optimization is still valid if DupPg
1716 // 'encompasses' OpPredicate, not only if they're the same predicate.
1717 if (OpPredicate == DupPg) {
1718 OpMultiplicand->takeName(&II);
1719 return IC.replaceInstUsesWith(II, OpMultiplicand);
1720 }
1721 }
1722
1723 return instCombineSVEVectorBinOp(IC, II);
1724}
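// Illustrative sketch of the unit-splat folds above:
//   sve.mul(%pg, %n, dup_x(1))          --> %n
//   sve.fmul(%pg, %n, dup(_, %pg, 1.0)) --> %n   (only when the predicates match)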
1725
1726static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1727 IntrinsicInst &II) {
1728 Value *UnpackArg = II.getArgOperand(0);
1729 auto *RetTy = cast<ScalableVectorType>(II.getType());
1730 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1731 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1732
1733 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1734 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1735 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1736 ScalarArg =
1737 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1738 Value *NewVal =
1739 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1740 NewVal->takeName(&II);
1741 return IC.replaceInstUsesWith(II, NewVal);
1742 }
1743
1744 return std::nullopt;
1745}
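// Illustrative sketch of the unpack fold above: unpacking a splat is just a
// splat of the sign- or zero-extended scalar, e.g.
//   uunpklo(splat(i32 %x)) --> splat(zext i32 %x to i64)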
1746static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1747 IntrinsicInst &II) {
1748 auto *OpVal = II.getOperand(0);
1749 auto *OpIndices = II.getOperand(1);
1750 VectorType *VTy = cast<VectorType>(II.getType());
1751
1752 // Check whether OpIndices is a constant splat value < minimal element count
1753 // of result.
1754 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1755 if (!SplatValue ||
1756 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1757 return std::nullopt;
1758
1759 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1760 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1761 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1762 auto *VectorSplat =
1763 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1764
1765 VectorSplat->takeName(&II);
1766 return IC.replaceInstUsesWith(II, VectorSplat);
1767}
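// Illustrative sketch of the fold above, for an in-range constant splat index:
//   sve.tbl(%v, dup_x(2)) --> splat(extractelement %v, 2)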
1768
1769static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
1770 IntrinsicInst &II) {
1771 Value *A, *B;
1772 Type *RetTy = II.getType();
1773 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
1774 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
1775
1776 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
1777 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
1778 if ((match(II.getArgOperand(0),
1779 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
1780 match(II.getArgOperand(1),
1781 m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||
1782 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
1783 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
1784 auto *TyA = cast<ScalableVectorType>(A->getType());
1785 if (TyA == B->getType() &&
1786 RetTy == ScalableVectorType::getDoubleElementsVectorType(TyA)) {
1787 auto *SubVec = IC.Builder.CreateInsertVector(
1788 RetTy, PoisonValue::get(RetTy), A, IC.Builder.getInt64(0));
1789 auto *ConcatVec = IC.Builder.CreateInsertVector(
1790 RetTy, SubVec, B, IC.Builder.getInt64(TyA->getMinNumElements()));
1791 ConcatVec->takeName(&II);
1792 return IC.replaceInstUsesWith(II, ConcatVec);
1793 }
1794 }
1795
1796 return std::nullopt;
1797}
1798
1799static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1800 IntrinsicInst &II) {
1801 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1802 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1803 Value *A, *B;
1804 if (match(II.getArgOperand(0),
1805 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1806 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1807 m_Specific(A), m_Specific(B))))
1808 return IC.replaceInstUsesWith(
1809 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1810
1811 return std::nullopt;
1812}
1813
1814static std::optional<Instruction *>
1815instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1816 Value *Mask = II.getOperand(0);
1817 Value *BasePtr = II.getOperand(1);
1818 Value *Index = II.getOperand(2);
1819 Type *Ty = II.getType();
1820 Value *PassThru = ConstantAggregateZero::get(Ty);
1821
1822 // Replace by zero constant when all lanes are inactive
1823 if (auto II_NA = instCombineSVENoActiveUnaryZero(IC, II))
1824 return II_NA;
1825
1826 // Contiguous gather => masked load.
1827 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1828 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1829 Value *IndexBase;
1830 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1831 m_Value(IndexBase), m_SpecificInt(1)))) {
1832 Align Alignment =
1833 BasePtr->getPointerAlignment(II.getDataLayout());
1834
1835 Type *VecPtrTy = PointerType::getUnqual(Ty);
1836 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1837 BasePtr, IndexBase);
1838 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1839 CallInst *MaskedLoad =
1840 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1841 MaskedLoad->takeName(&II);
1842 return IC.replaceInstUsesWith(II, MaskedLoad);
1843 }
1844
1845 return std::nullopt;
1846}
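// Illustrative sketch of the gather fold above: a stride-one index vector
// makes the gather contiguous, e.g.
//   %idx = sve.index(%base, 1)
//   sve.ld1.gather.index(%pg, %p, %idx)
//     --> masked.load(gep(%p, %base), align, %pg, zeroinitializer)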
1847
1848static std::optional<Instruction *>
1849instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1850 Value *Val = II.getOperand(0);
1851 Value *Mask = II.getOperand(1);
1852 Value *BasePtr = II.getOperand(2);
1853 Value *Index = II.getOperand(3);
1854 Type *Ty = Val->getType();
1855
1856 // Contiguous scatter => masked store.
1857 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1858 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1859 Value *IndexBase;
1860 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1861 m_Value(IndexBase), m_SpecificInt(1)))) {
1862 Align Alignment =
1863 BasePtr->getPointerAlignment(II.getDataLayout());
1864
1865 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1866 BasePtr, IndexBase);
1867 Type *VecPtrTy = PointerType::getUnqual(Ty);
1868 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1869
1870 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1871
1872 return IC.eraseInstFromFunction(II);
1873 }
1874
1875 return std::nullopt;
1876}
1877
1878static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1879 IntrinsicInst &II) {
1880 Type *Int32Ty = IC.Builder.getInt32Ty();
1881 Value *Pred = II.getOperand(0);
1882 Value *Vec = II.getOperand(1);
1883 Value *DivVec = II.getOperand(2);
1884
1885 Value *SplatValue = getSplatValue(DivVec);
1886 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1887 if (!SplatConstantInt)
1888 return std::nullopt;
1889 APInt Divisor = SplatConstantInt->getValue();
1890
1891 if (Divisor.isPowerOf2()) {
1892 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1893 auto ASRD = IC.Builder.CreateIntrinsic(
1894 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1895 return IC.replaceInstUsesWith(II, ASRD);
1896 }
1897 if (Divisor.isNegatedPowerOf2()) {
1898 Divisor.negate();
1899 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1900 auto ASRD = IC.Builder.CreateIntrinsic(
1901 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1902 auto NEG = IC.Builder.CreateIntrinsic(
1903 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1904 return IC.replaceInstUsesWith(II, NEG);
1905 }
1906
1907 return std::nullopt;
1908}
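// Illustrative sketch of the power-of-two cases above:
//   sve.sdiv(%pg, %x, dup_x(8))  --> sve.asrd(%pg, %x, 3)
//   sve.sdiv(%pg, %x, dup_x(-8)) --> sve.asrd(%pg, %x, 3) followed by sve.neg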
1909
1910bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1911 size_t VecSize = Vec.size();
1912 if (VecSize == 1)
1913 return true;
1914 if (!isPowerOf2_64(VecSize))
1915 return false;
1916 size_t HalfVecSize = VecSize / 2;
1917
1918 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1919 RHS != Vec.end(); LHS++, RHS++) {
1920 if (*LHS != nullptr && *RHS != nullptr) {
1921 if (*LHS == *RHS)
1922 continue;
1923 else
1924 return false;
1925 }
1926 if (!AllowPoison)
1927 return false;
1928 if (*LHS == nullptr && *RHS != nullptr)
1929 *LHS = *RHS;
1930 }
1931
1932 Vec.resize(HalfVecSize);
1933 SimplifyValuePattern(Vec, AllowPoison);
1934 return true;
1935}
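// Illustrative sketch: SimplifyValuePattern keeps halving a power-of-two sized
// pattern while the two halves agree (poison entries may adopt the matching
// value when AllowPoison is set), e.g.
//   {a, b, a, b}      --> {a, b}
//   {a, poison, a, b} --> {a, b}      (AllowPoison)
//   {a, b, b, a}      --> unchanged; the caller bails out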
1936
1937// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1938// to dupqlane(f64(C)) where C is A concatenated with B
1939static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1940 IntrinsicInst &II) {
1941 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1942 if (!match(II.getOperand(0),
1943 m_Intrinsic<Intrinsic::vector_insert>(
1944 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1945 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1946 return std::nullopt;
1947 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1948
1949 // Insert the scalars into a container ordered by InsertElement index
1950 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1951 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1952 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1953 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1954 CurrentInsertElt = InsertElt->getOperand(0);
1955 }
1956
1957 bool AllowPoison =
1958 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1959 if (!SimplifyValuePattern(Elts, AllowPoison))
1960 return std::nullopt;
1961
1962 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1963 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1964 for (size_t I = 0; I < Elts.size(); I++) {
1965 if (Elts[I] == nullptr)
1966 continue;
1967 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
1968 IC.Builder.getInt64(I));
1969 }
1970 if (InsertEltChain == nullptr)
1971 return std::nullopt;
1972
1973 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1974 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1975 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1976 // be narrowed back to the original type.
1977 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1978 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1979 IIScalableTy->getMinNumElements() /
1980 PatternWidth;
1981
1982 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1983 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1984 auto *WideShuffleMaskTy =
1985 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1986
1987 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
1988 auto InsertSubvector = IC.Builder.CreateInsertVector(
1989 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1990 auto WideBitcast =
1991 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1992 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1993 auto WideShuffle = IC.Builder.CreateShuffleVector(
1994 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1995 auto NarrowBitcast =
1996 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1997
1998 return IC.replaceInstUsesWith(II, NarrowBitcast);
1999}
2000
2001static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
2002 IntrinsicInst &II) {
2003 Value *A = II.getArgOperand(0);
2004 Value *B = II.getArgOperand(1);
2005 if (A == B)
2006 return IC.replaceInstUsesWith(II, A);
2007
2008 return std::nullopt;
2009}
2010
2011static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
2012 IntrinsicInst &II) {
2013 Value *Pred = II.getOperand(0);
2014 Value *Vec = II.getOperand(1);
2015 Value *Shift = II.getOperand(2);
2016
2017 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
2018 Value *AbsPred, *MergedValue;
2019 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
2020 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
2021 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
2022 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
2023
2024 return std::nullopt;
2025
2026 // Transform is valid if any of the following are true:
2027 // * The ABS merge value is an undef or non-negative
2028 // * The ABS predicate is all active
2029 // * The ABS predicate and the SRSHL predicates are the same
2030 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
2031 AbsPred != Pred && !isAllActivePredicate(AbsPred))
2032 return std::nullopt;
2033
2034 // Only valid when the shift amount is non-negative, otherwise the rounding
2035 // behaviour of SRSHL cannot be ignored.
2036 if (!match(Shift, m_NonNegative()))
2037 return std::nullopt;
2038
2039 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
2040 {II.getType()}, {Pred, Vec, Shift});
2041
2042 return IC.replaceInstUsesWith(II, LSL);
2043}
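// Illustrative sketch of the SRSHL fold above: when the shifted value comes
// from an abs (so it is non-negative) and the shift amount is non-negative,
// SRSHL's rounding can never fire, e.g.
//   %a = sve.abs(%m, %pg, %x)
//   sve.srshl(%pg, %a, dup_x(2)) --> sve.lsl(%pg, %a, dup_x(2))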
2044
2045std::optional<Instruction *>
2046AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
2047 IntrinsicInst &II) const {
2048 Intrinsic::ID IID = II.getIntrinsicID();
2049 switch (IID) {
2050 default:
2051 break;
2052
2053 case Intrinsic::aarch64_sve_st1_scatter:
2054 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
2055 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
2056 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
2057 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
2058 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
2059 case Intrinsic::aarch64_sve_st1dq:
2060 case Intrinsic::aarch64_sve_st1q_scatter_index:
2061 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
2062 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
2063 case Intrinsic::aarch64_sve_st1wq:
2064 case Intrinsic::aarch64_sve_stnt1:
2065 case Intrinsic::aarch64_sve_stnt1_scatter:
2066 case Intrinsic::aarch64_sve_stnt1_scatter_index:
2067 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
2068 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
2069 return instCombineSVENoActiveUnaryErase(IC, II, 1);
2070 case Intrinsic::aarch64_sve_st2:
2071 case Intrinsic::aarch64_sve_st2q:
2072 return instCombineSVENoActiveUnaryErase(IC, II, 2);
2073 case Intrinsic::aarch64_sve_st3:
2074 case Intrinsic::aarch64_sve_st3q:
2075 return instCombineSVENoActiveUnaryErase(IC, II, 3);
2076 case Intrinsic::aarch64_sve_st4:
2077 case Intrinsic::aarch64_sve_st4q:
2078 return instCombineSVENoActiveUnaryErase(IC, II, 4);
2079 case Intrinsic::aarch64_sve_ld1_gather:
2080 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
2081 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
2082 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
2083 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
2084 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
2085 case Intrinsic::aarch64_sve_ld1q_gather_index:
2086 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
2087 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
2088 case Intrinsic::aarch64_sve_ld1ro:
2089 case Intrinsic::aarch64_sve_ld1rq:
2090 case Intrinsic::aarch64_sve_ld1udq:
2091 case Intrinsic::aarch64_sve_ld1uwq:
2092 case Intrinsic::aarch64_sve_ld2_sret:
2093 case Intrinsic::aarch64_sve_ld2q_sret:
2094 case Intrinsic::aarch64_sve_ld3_sret:
2095 case Intrinsic::aarch64_sve_ld3q_sret:
2096 case Intrinsic::aarch64_sve_ld4_sret:
2097 case Intrinsic::aarch64_sve_ld4q_sret:
2098 case Intrinsic::aarch64_sve_ldff1:
2099 case Intrinsic::aarch64_sve_ldff1_gather:
2100 case Intrinsic::aarch64_sve_ldff1_gather_index:
2101 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
2102 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
2103 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
2104 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
2105 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
2106 case Intrinsic::aarch64_sve_ldnf1:
2107 case Intrinsic::aarch64_sve_ldnt1:
2108 case Intrinsic::aarch64_sve_ldnt1_gather:
2109 case Intrinsic::aarch64_sve_ldnt1_gather_index:
2110 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
2111 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
2112 return instCombineSVENoActiveUnaryZero(IC, II);
2113 case Intrinsic::aarch64_neon_fmaxnm:
2114 case Intrinsic::aarch64_neon_fminnm:
2115 return instCombineMaxMinNM(IC, II);
2116 case Intrinsic::aarch64_sve_convert_from_svbool:
2117 return instCombineConvertFromSVBool(IC, II);
2118 case Intrinsic::aarch64_sve_dup:
2119 return instCombineSVEDup(IC, II);
2120 case Intrinsic::aarch64_sve_dup_x:
2121 return instCombineSVEDupX(IC, II);
2122 case Intrinsic::aarch64_sve_cmpne:
2123 case Intrinsic::aarch64_sve_cmpne_wide:
2124 return instCombineSVECmpNE(IC, II);
2125 case Intrinsic::aarch64_sve_rdffr:
2126 return instCombineRDFFR(IC, II);
2127 case Intrinsic::aarch64_sve_lasta:
2128 case Intrinsic::aarch64_sve_lastb:
2129 return instCombineSVELast(IC, II);
2130 case Intrinsic::aarch64_sve_clasta_n:
2131 case Intrinsic::aarch64_sve_clastb_n:
2132 return instCombineSVECondLast(IC, II);
2133 case Intrinsic::aarch64_sve_cntd:
2134 return instCombineSVECntElts(IC, II, 2);
2135 case Intrinsic::aarch64_sve_cntw:
2136 return instCombineSVECntElts(IC, II, 4);
2137 case Intrinsic::aarch64_sve_cnth:
2138 return instCombineSVECntElts(IC, II, 8);
2139 case Intrinsic::aarch64_sve_cntb:
2140 return instCombineSVECntElts(IC, II, 16);
2141 case Intrinsic::aarch64_sve_ptest_any:
2142 case Intrinsic::aarch64_sve_ptest_first:
2143 case Intrinsic::aarch64_sve_ptest_last:
2144 return instCombineSVEPTest(IC, II);
2145 case Intrinsic::aarch64_sve_fabd:
2146 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fabd_u);
2147 case Intrinsic::aarch64_sve_fadd:
2148 return instCombineSVEVectorFAdd(IC, II);
2149 case Intrinsic::aarch64_sve_fadd_u:
2150 return instCombineSVEVectorFAddU(IC, II);
2151 case Intrinsic::aarch64_sve_fdiv:
2152 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fdiv_u);
2153 case Intrinsic::aarch64_sve_fmax:
2154 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmax_u);
2155 case Intrinsic::aarch64_sve_fmaxnm:
2156 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmaxnm_u);
2157 case Intrinsic::aarch64_sve_fmin:
2158 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmin_u);
2159 case Intrinsic::aarch64_sve_fminnm:
2160 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fminnm_u);
2161 case Intrinsic::aarch64_sve_fmla:
2162 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmla_u);
2163 case Intrinsic::aarch64_sve_fmls:
2164 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmls_u);
2165 case Intrinsic::aarch64_sve_fmul:
2166 if (auto II_U =
2167 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmul_u))
2168 return II_U;
2169 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2170 case Intrinsic::aarch64_sve_fmul_u:
2171 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
2172 case Intrinsic::aarch64_sve_fmulx:
2173 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fmulx_u);
2174 case Intrinsic::aarch64_sve_fnmla:
2175 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmla_u);
2176 case Intrinsic::aarch64_sve_fnmls:
2177 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_fnmls_u);
2178 case Intrinsic::aarch64_sve_fsub:
2179 return instCombineSVEVectorFSub(IC, II);
2180 case Intrinsic::aarch64_sve_fsub_u:
2181 return instCombineSVEVectorFSubU(IC, II);
2182 case Intrinsic::aarch64_sve_add:
2183 return instCombineSVEVectorAdd(IC, II);
2184 case Intrinsic::aarch64_sve_add_u:
2185 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2186 Intrinsic::aarch64_sve_mla_u>(
2187 IC, II, true);
2188 case Intrinsic::aarch64_sve_mla:
2189 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mla_u);
2190 case Intrinsic::aarch64_sve_mls:
2191 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mls_u);
2192 case Intrinsic::aarch64_sve_mul:
2193 if (auto II_U =
2194 instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_mul_u))
2195 return II_U;
2196 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2197 case Intrinsic::aarch64_sve_mul_u:
2198 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
2199 case Intrinsic::aarch64_sve_sabd:
2200 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sabd_u);
2201 case Intrinsic::aarch64_sve_smax:
2202 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smax_u);
2203 case Intrinsic::aarch64_sve_smin:
2204 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smin_u);
2205 case Intrinsic::aarch64_sve_smulh:
2206 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_smulh_u);
2207 case Intrinsic::aarch64_sve_sub:
2208 return instCombineSVEVectorSub(IC, II);
2209 case Intrinsic::aarch64_sve_sub_u:
2210 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
2211 Intrinsic::aarch64_sve_mls_u>(
2212 IC, II, true);
2213 case Intrinsic::aarch64_sve_uabd:
2214 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uabd_u);
2215 case Intrinsic::aarch64_sve_umax:
2216 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umax_u);
2217 case Intrinsic::aarch64_sve_umin:
2218 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umin_u);
2219 case Intrinsic::aarch64_sve_umulh:
2220 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_umulh_u);
2221 case Intrinsic::aarch64_sve_asr:
2222 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_asr_u);
2223 case Intrinsic::aarch64_sve_lsl:
2224 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsl_u);
2225 case Intrinsic::aarch64_sve_lsr:
2226 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_lsr_u);
2227 case Intrinsic::aarch64_sve_and:
2228 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_and_u);
2229 case Intrinsic::aarch64_sve_bic:
2230 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_bic_u);
2231 case Intrinsic::aarch64_sve_eor:
2232 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_eor_u);
2233 case Intrinsic::aarch64_sve_orr:
2234 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_orr_u);
2235 case Intrinsic::aarch64_sve_sqsub:
2236 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_sqsub_u);
2237 case Intrinsic::aarch64_sve_uqsub:
2238 return instCombineSVEAllOrNoActive(IC, II, Intrinsic::aarch64_sve_uqsub_u);
2239 case Intrinsic::aarch64_sve_tbl:
2240 return instCombineSVETBL(IC, II);
2241 case Intrinsic::aarch64_sve_uunpkhi:
2242 case Intrinsic::aarch64_sve_uunpklo:
2243 case Intrinsic::aarch64_sve_sunpkhi:
2244 case Intrinsic::aarch64_sve_sunpklo:
2245 return instCombineSVEUnpack(IC, II);
2246 case Intrinsic::aarch64_sve_uzp1:
2247 return instCombineSVEUzp1(IC, II);
2248 case Intrinsic::aarch64_sve_zip1:
2249 case Intrinsic::aarch64_sve_zip2:
2250 return instCombineSVEZip(IC, II);
2251 case Intrinsic::aarch64_sve_ld1_gather_index:
2252 return instCombineLD1GatherIndex(IC, II);
2253 case Intrinsic::aarch64_sve_st1_scatter_index:
2254 return instCombineST1ScatterIndex(IC, II);
2255 case Intrinsic::aarch64_sve_ld1:
2256 return instCombineSVELD1(IC, II, DL);
2257 case Intrinsic::aarch64_sve_st1:
2258 return instCombineSVEST1(IC, II, DL);
2259 case Intrinsic::aarch64_sve_sdiv:
2260 return instCombineSVESDIV(IC, II);
2261 case Intrinsic::aarch64_sve_sel:
2262 return instCombineSVESel(IC, II);
2263 case Intrinsic::aarch64_sve_srshl:
2264 return instCombineSVESrshl(IC, II);
2265 case Intrinsic::aarch64_sve_dupq_lane:
2266 return instCombineSVEDupqLane(IC, II);
2267 }
2268
2269 return std::nullopt;
2270}
2271
2272std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
2273 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
2274 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
2275 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2276 SimplifyAndSetOp) const {
2277 switch (II.getIntrinsicID()) {
2278 default:
2279 break;
2280 case Intrinsic::aarch64_neon_fcvtxn:
2281 case Intrinsic::aarch64_neon_rshrn:
2282 case Intrinsic::aarch64_neon_sqrshrn:
2283 case Intrinsic::aarch64_neon_sqrshrun:
2284 case Intrinsic::aarch64_neon_sqshrn:
2285 case Intrinsic::aarch64_neon_sqshrun:
2286 case Intrinsic::aarch64_neon_sqxtn:
2287 case Intrinsic::aarch64_neon_sqxtun:
2288 case Intrinsic::aarch64_neon_uqrshrn:
2289 case Intrinsic::aarch64_neon_uqshrn:
2290 case Intrinsic::aarch64_neon_uqxtn:
2291 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
2292 break;
2293 }
2294
2295 return std::nullopt;
2296}
2297
2298TypeSize
2299AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
2300 switch (K) {
2301 case TargetTransformInfo::RGK_Scalar:
2302 return TypeSize::getFixed(64);
2303 case TargetTransformInfo::RGK_FixedWidthVector:
2304 if (ST->useSVEForFixedLengthVectors() &&
2305 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
2306 return TypeSize::getFixed(
2307 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
2308 else if (ST->isNeonAvailable())
2309 return TypeSize::getFixed(128);
2310 else
2311 return TypeSize::getFixed(0);
2312 case TargetTransformInfo::RGK_ScalableVector:
2313 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
2314 EnableScalableAutovecInStreamingMode))
2315 return TypeSize::getScalable(128);
2316 else
2317 return TypeSize::getScalable(0);
2318 }
2319 llvm_unreachable("Unsupported register kind");
2320}
2321
2322bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
2323 ArrayRef<const Value *> Args,
2324 Type *SrcOverrideTy) {
2325 // A helper that returns a vector type from the given type. The number of
2326 // elements in type Ty determines the vector width.
2327 auto toVectorTy = [&](Type *ArgTy) {
2328 return VectorType::get(ArgTy->getScalarType(),
2329 cast<VectorType>(DstTy)->getElementCount());
2330 };
2331
2332 // Exit early if DstTy is not a vector type whose elements are one of [i16,
2333 // i32, i64]. SVE doesn't generally have the same set of instructions to
2334 // perform an extend with the add/sub/mul. There are SMULLB style
2335 // instructions, but they operate on top/bottom, requiring some sort of lane
2336 // interleaving to be used with zext/sext.
2337 unsigned DstEltSize = DstTy->getScalarSizeInBits();
2338 if (!useNeonVector(DstTy) || Args.size() != 2 ||
2339 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
2340 return false;
2341
2342 // Determine if the operation has a widening variant. We consider both the
2343 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
2344 // instructions.
2345 //
2346 // TODO: Add additional widening operations (e.g., shl, etc.) once we
2347 // verify that their extending operands are eliminated during code
2348 // generation.
2349 Type *SrcTy = SrcOverrideTy;
2350 switch (Opcode) {
2351 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
2352 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
2353 // The second operand needs to be an extend
2354 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
2355 if (!SrcTy)
2356 SrcTy =
2357 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
2358 } else
2359 return false;
2360 break;
2361 case Instruction::Mul: { // SMULL(2), UMULL(2)
2362 // Both operands need to be extends of the same type.
2363 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
2364 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
2365 if (!SrcTy)
2366 SrcTy =
2367 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2368 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2369 // If one of the operands is a Zext and the other has enough zero bits to
2370 // be treated as unsigned, we can still generate a umull, meaning the zext
2371 // is free.
2372 KnownBits Known =
2373 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2374 if (Args[0]->getType()->getScalarSizeInBits() -
2375 Known.Zero.countLeadingOnes() >
2376 DstTy->getScalarSizeInBits() / 2)
2377 return false;
2378 if (!SrcTy)
2379 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2380 DstTy->getScalarSizeInBits() / 2));
2381 } else
2382 return false;
2383 break;
2384 }
2385 default:
2386 return false;
2387 }
2388
2389 // Legalize the destination type and ensure it can be used in a widening
2390 // operation.
2391 auto DstTyL = getTypeLegalizationCost(DstTy);
2392 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2393 return false;
2394
2395 // Legalize the source type and ensure it can be used in a widening
2396 // operation.
2397 assert(SrcTy && "Expected some SrcTy");
2398 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2399 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2400 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2401 return false;
2402
2403 // Get the total number of vector elements in the legalized types.
2404 InstructionCost NumDstEls =
2405 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2406 InstructionCost NumSrcEls =
2407 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2408
2409 // Return true if the legalized types have the same number of vector elements
2410 // and the destination element type size is twice that of the source type.
2411 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2412}
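// Illustrative sketch: isWideningInstruction returns true for patterns such as
//   %e = zext <8 x i8> %x to <8 x i16>
//   %r = add <8 x i16> %y, %e        ; selectable as a single uaddw
// which lets getCastInstrCost below treat the extend as free.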
2413
2414// s/urhadd instructions implement the following pattern, making the
2415// extends free:
2416// %x = add ((zext i8 -> i16), 1)
2417// %y = (zext i8 -> i16)
2418// trunc i16 (lshr (add %x, %y), 1) -> i8
2419//
2420static bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2421 Type *Src) {
2422 // The source should be a legal vector type.
2423 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2424 (Src->isScalableTy() && !ST->hasSVE2()))
2425 return false;
2426
2427 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2428 return false;
2429
2430 // Look for trunc/shl/add before trying to match the pattern.
2431 const Instruction *Add = ExtUser;
2432 auto *AddUser =
2433 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2434 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2435 Add = AddUser;
2436
2437 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2438 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2439 return false;
2440
2441 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2442 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2443 Src->getScalarSizeInBits() !=
2444 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2445 return false;
2446
2447 // Try to match the whole pattern. Ext could be either the first or second
2448 // m_ZExtOrSExt matched.
2449 Instruction *Ex1, *Ex2;
2450 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2451 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2452 return false;
2453
2454 // Ensure both extends are of the same type
2455 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2456 Ex1->getOpcode() == Ex2->getOpcode())
2457 return true;
2458
2459 return false;
2460}
2461
2462InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2463 Type *Src,
2464 TTI::CastContextHint CCH,
2465 TTI::TargetCostKind CostKind,
2466 const Instruction *I) {
2467 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2468 assert(ISD && "Invalid opcode");
2469 // If the cast is observable, and it is used by a widening instruction (e.g.,
2470 // uaddl, saddw, etc.), it may be free.
2471 if (I && I->hasOneUser()) {
2472 auto *SingleUser = cast<Instruction>(*I->user_begin());
2473 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2474 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2475 // For adds only count the second operand as free if both operands are
2476 // extends but not the same operation. (i.e. both operands are not free in
2477 // add(sext, zext)).
2478 if (SingleUser->getOpcode() == Instruction::Add) {
2479 if (I == SingleUser->getOperand(1) ||
2480 (isa<CastInst>(SingleUser->getOperand(1)) &&
2481 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2482 return 0;
2483 } else // Others are free so long as isWideningInstruction returned true.
2484 return 0;
2485 }
2486
2487 // The cast will be free for the s/urhadd instructions
2488 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2489 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2490 return 0;
2491 }
2492
2493 // TODO: Allow non-throughput costs that aren't binary.
2494 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2495 if (CostKind != TTI::TCK_RecipThroughput)
2496 return Cost == 0 ? 0 : 1;
2497 return Cost;
2498 };
2499
2500 EVT SrcTy = TLI->getValueType(DL, Src);
2501 EVT DstTy = TLI->getValueType(DL, Dst);
2502
2503 if (!SrcTy.isSimple() || !DstTy.isSimple())
2504 return AdjustCost(
2505 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2506
2507 static const TypeConversionCostTblEntry
2508 ConversionTbl[] = {
2509 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2510 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2511 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2512 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2513 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2514 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2515 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2516 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2517 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2518 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2519 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2520 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2521 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2522 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2523 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2524 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2525 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2526 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2527 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2528 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2529
2530 // Truncations on nxvmiN
2531 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2532 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2533 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2534 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2535 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2536 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2537 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2538 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2539 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2540 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2541 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2542 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2543 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2544 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2545 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2546 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2547
2548 // The number of shll instructions for the extension.
2549 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2550 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2551 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2552 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2553 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2554 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2555 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2556 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2557 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2558 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2559 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2560 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2561 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2562 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2563 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2564 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2565
2566 // LowerVectorINT_TO_FP:
2567 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2568 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2569 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2570 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2571 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2572 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2573
2574 // Complex: to v2f32
2575 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2576 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2577 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2578 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2579 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2580 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2581
2582 // Complex: to v4f32
2583 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2584 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2585 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2586 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2587
2588 // Complex: to v8f32
2589 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2590 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2591 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2592 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2593
2594 // Complex: to v16f32
2595 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2596 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2597
2598 // Complex: to v2f64
2599 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2600 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2601 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2602 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2603 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2604 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2605
2606 // Complex: to v4f64
2607 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2608 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2609
2610 // LowerVectorFP_TO_INT
2611 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2612 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2613 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2614 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2615 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2616 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2617
2618 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2619 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2620 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2621 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2622 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2623 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2624 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2625
2626 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2627 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2628 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2629 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2630 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2631
2632 // Complex, from nxv2f32.
2633 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2634 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2635 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2636 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2637 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2638 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2639 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2640 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2641
2642 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2643 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2644 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2645 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2646 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2647 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2648 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2649
2650 // Complex, from nxv2f64.
2651 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2652 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2653 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2654 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2655 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2656 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2657 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2658 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2659
2660 // Complex, from nxv4f32.
2661 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2662 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2663 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2664 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2665 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2666 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2667 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2668 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2669
2670 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2671 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2672 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2673 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2674 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2675
2676 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2677 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2678 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2679 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2680 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2681 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2682 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2683
2684 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2685 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2686 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2687 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2688 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2689
2690 // Complex, from nxv8f16.
2691 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2692 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2693 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2694 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2695 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2696 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2697 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2698 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2699
2700 // Complex, from nxv4f16.
2701 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2702 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2703 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2704 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2705 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2706 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2707 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2708 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2709
2710 // Complex, from nxv2f16.
2711 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2712 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2713 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2714 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2715 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2716 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2717 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2718 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2719
2720 // Truncate from nxvmf32 to nxvmf16.
2721 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2722 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2723 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2724
2725 // Truncate from nxvmf64 to nxvmf16.
2726 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2727 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2728 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2729
2730 // Truncate from nxvmf64 to nxvmf32.
2731 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2732 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2733 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2734
2735 // Extend from nxvmf16 to nxvmf32.
2736 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2737 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2738 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2739
2740 // Extend from nxvmf16 to nxvmf64.
2741 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2742 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2743 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2744
2745 // Extend from nxvmf32 to nxvmf64.
2746 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2747 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2748 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2749
2750 // Bitcasts from float to integer
2751 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2752 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2753 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2754
2755 // Bitcasts from integer to float
2756 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2757 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2758 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2759
2760 // Add cost for extending to illegal -too wide- scalable vectors.
2761 // zero/sign extend are implemented by multiple unpack operations,
2762 // where each operation has a cost of 1.
2763 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2764 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2765 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2766 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2767 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2768 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2769
2770 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2771 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2772 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2773 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2774 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2775 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2776 };
2777
2778 // Estimate the cost of a fixed-length operation that is lowered to SVE
2779 // registers by scaling the cost of the equivalent scalable operation with
2780 // the number of SVE registers required to hold the fixed-width type.
2781 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2782 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2783 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2784 ST->useSVEForFixedLengthVectors(WiderTy)) {
2785 std::pair<InstructionCost, MVT> LT =
2786 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2787 unsigned NumElements = AArch64::SVEBitsPerBlock /
2788 LT.second.getScalarSizeInBits();
2789 return AdjustCost(
2790 LT.first *
2791 getCastInstrCost(
2792 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2793 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2794 CostKind, I));
2795 }
2796
2797 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2798 DstTy.getSimpleVT(),
2799 SrcTy.getSimpleVT()))
2800 return AdjustCost(Entry->Cost);
2801
2802 static const TypeConversionCostTblEntry FP16Tbl[] = {
2803 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2804 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2805 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2806 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2807 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2808 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2809 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2810 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2811 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2812 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2813 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2814 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2815 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2816 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2817 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2818 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2819 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2820 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2821 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2822 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2823 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2824 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2825 };
2826
2827 if (ST->hasFullFP16())
2828 if (const auto *Entry = ConvertCostTableLookup(
2829 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2830 return AdjustCost(Entry->Cost);
2831
2832 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2833 CCH == TTI::CastContextHint::Masked &&
2834 ST->isSVEorStreamingSVEAvailable() &&
2835 TLI->getTypeAction(Src->getContext(), SrcTy) ==
2836 TargetLowering::TypePromoteInteger &&
2837 TLI->getTypeAction(Dst->getContext(), DstTy) ==
2838 TargetLowering::TypeSplitVector) {
2839 // The standard behaviour in the backend for these cases is to split the
2840 // extend up into two parts:
2841 // 1. Perform an extending load or masked load up to the legal type.
2842 // 2. Extend the loaded data to the final type.
2843 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
2844 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
2845 InstructionCost Part1 = AArch64TTIImpl::getCastInstrCost(
2846 Opcode, LegalTy, Src, CCH, CostKind, I);
2847 InstructionCost Part2 = AArch64TTIImpl::getCastInstrCost(
2848 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
2849 return Part1 + Part2;
2850 }
2851
2852 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2853 // but we also want to include the TTI::CastContextHint::Masked case too.
2854 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2855 CCH == TTI::CastContextHint::Masked &&
2856 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
2857 CCH = TTI::CastContextHint::Normal;
2858
2859 return AdjustCost(
2860 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2861}
2862
2863InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2864 Type *Dst,
2865 VectorType *VecTy,
2866 unsigned Index) {
2867
2868 // Make sure we were given a valid extend opcode.
2869 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2870 "Invalid opcode");
2871
2872 // We are extending an element we extract from a vector, so the source type
2873 // of the extend is the element type of the vector.
2874 auto *Src = VecTy->getElementType();
2875
2876 // Sign- and zero-extends are for integer types only.
2877 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2878
2879 // Get the cost for the extract. We compute the cost (if any) for the extend
2880 // below.
2881 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2882 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2883 CostKind, Index, nullptr, nullptr);
2884
2885 // Legalize the types.
2886 auto VecLT = getTypeLegalizationCost(VecTy);
2887 auto DstVT = TLI->getValueType(DL, Dst);
2888 auto SrcVT = TLI->getValueType(DL, Src);
2889
2890 // If the resulting type is still a vector and the destination type is legal,
2891 // we may get the extension for free. If not, get the default cost for the
2892 // extend.
2893 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2894 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2895 CostKind);
2896
2897 // The destination type should be larger than the element type. If not, get
2898 // the default cost for the extend.
2899 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2900 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2901 CostKind);
2902
2903 switch (Opcode) {
2904 default:
2905 llvm_unreachable("Opcode should be either SExt or ZExt");
2906
2907 // For sign-extends, we only need a smov, which performs the extension
2908 // automatically.
2909 case Instruction::SExt:
2910 return Cost;
2911
2912 // For zero-extends, the extend is performed automatically by a umov unless
2913 // the destination type is i64 and the element type is i8 or i16.
2914 case Instruction::ZExt:
2915 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2916 return Cost;
2917 }
2918
2919 // If we are unable to perform the extend for free, get the default cost.
2920 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2921 CostKind);
2922}
2923
2924InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2925 TTI::TargetCostKind CostKind,
2926 const Instruction *I) {
2927 if (CostKind != TTI::TCK_RecipThroughput)
2928 return Opcode == Instruction::PHI ? 0 : 1;
2929 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2930 // Branches are assumed to be predicted.
2931 return 0;
2932}
2933
2934InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2935 Type *Val,
2936 unsigned Index,
2937 bool HasRealUse) {
2938 assert(Val->isVectorTy() && "This must be a vector type");
2939
2940 if (Index != -1U) {
2941 // Legalize the type.
2942 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2943
2944 // This type is legalized to a scalar type.
2945 if (!LT.second.isVector())
2946 return 0;
2947
2948 // The type may be split. For fixed-width vectors we can normalize the
2949 // index to the new type.
2950 if (LT.second.isFixedLengthVector()) {
2951 unsigned Width = LT.second.getVectorNumElements();
2952 Index = Index % Width;
2953 }
2954
2955 // The element at index zero is already inside the vector.
2956 // - For a physical (HasRealUse==true) insert-element or extract-element
2957 // instruction that extracts integers, an explicit FPR -> GPR move is
2958 // needed. So it has non-zero cost.
2959 // - For the rest of cases (virtual instruction or element type is float),
2960 // consider the instruction free.
2961 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2962 return 0;
2963
2964 // This is recognising an LD1 (load a single-element structure to one lane
2965 // of one register) instruction. I.e., if this is an `insertelement`
2966 // instruction and its second operand is a load, then we will generate an
2967 // LD1, which is an expensive instruction.
2968 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2969 return ST->getVectorInsertExtractBaseCost() + 1;
2970
2971 // i1 inserts and extracts will include an extra cset or cmp of the vector
2972 // value. Increase the cost by 1 to account for this.
2973 if (Val->getScalarSizeInBits() == 1)
2974 return ST->getVectorInsertExtractBaseCost() + 1;
2975
2976 // FIXME:
2977 // If the extract-element and insert-element instructions could be
2978 // simplified away (e.g., could be combined into users by looking at use-def
2979 // context), they have no cost. This is not done in the first place for
2980 // compile-time considerations.
2981 }
2982
2983 // All other insert/extracts cost this much.
2984 return ST->getVectorInsertExtractBaseCost();
2985}
2986
2989 unsigned Index, Value *Op0,
2990 Value *Op1) {
2991 bool HasRealUse =
2992 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2993 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2994}
2995
2997 Type *Val,
2999 unsigned Index) {
3000 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
3001}
3002
3004 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
3006 if (isa<ScalableVectorType>(Ty))
3008 if (Ty->getElementType()->isFloatingPointTy())
3009 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
3010 CostKind);
3011 return DemandedElts.popcount() * (Insert + Extract) *
3012 ST->getVectorInsertExtractBaseCost();
3013}
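
The fixed-width path above reduces to a simple product. The sketch below restates it with an explicit base-cost parameter; the default of 2 is only an illustrative stand-in for the subtarget's insert/extract base cost hook, not a guaranteed value.

unsigned fixedVectorScalarizationOverhead(unsigned NumDemandedElts, bool Insert,
                                          bool Extract, unsigned BaseCost = 2) {
  // popcount(DemandedElts) * (Insert + Extract) * per-element base cost.
  return NumDemandedElts * ((Insert ? 1u : 0u) + (Extract ? 1u : 0u)) * BaseCost;
}
// e.g. both inserting and extracting 4 demanded lanes: 4 * 2 * 2 = 16.
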
3014
3016 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
3019 const Instruction *CxtI) {
3020
3021 // TODO: Handle more cost kinds.
3023 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3024 Op2Info, Args, CxtI);
3025
3026 // Legalize the type.
3027 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3028 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3029
3030 switch (ISD) {
3031 default:
3032 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3033 Op2Info);
3034 case ISD::SDIV:
3035 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
3036 // On AArch64, scalar signed division by a power-of-two constant is
3037 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
3038 // The OperandValue properties may not be the same as those of the
3039 // previous operation; conservatively assume OP_None.
3041 Instruction::Add, Ty, CostKind,
3042 Op1Info.getNoProps(), Op2Info.getNoProps());
3043 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
3044 Op1Info.getNoProps(), Op2Info.getNoProps());
3046 Instruction::Select, Ty, CostKind,
3047 Op1Info.getNoProps(), Op2Info.getNoProps());
3048 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
3049 Op1Info.getNoProps(), Op2Info.getNoProps());
3050 return Cost;
3051 }
3052 [[fallthrough]];
3053 case ISD::UDIV: {
3054 if (Op2Info.isConstant() && Op2Info.isUniform()) {
3055 auto VT = TLI->getValueType(DL, Ty);
3056 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
3057 // Vector signed division by a constant is expanded to the
3058 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
3059 // to MULHS + SUB + SRL + ADD + SRL.
3061 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3063 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3065 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
3066 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
3067 }
3068 }
3069
3071 Opcode, Ty, CostKind, Op1Info, Op2Info);
3072 if (Ty->isVectorTy()) {
3073 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
3074 // When SDIV/UDIV operations are lowered using SVE, the cost can be
3075 // lower.
3076 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
3077 ->getPrimitiveSizeInBits()
3078 .getFixedValue() < 128) {
3079 EVT VT = TLI->getValueType(DL, Ty);
3080 static const CostTblEntry DivTbl[]{
3081 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
3082 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
3083 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
3084 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
3085 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
3086 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
3087
3088 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
3089 if (nullptr != Entry)
3090 return Entry->Cost;
3091 }
3092 // For 8/16-bit elements, the cost is higher because the type
3093 // requires promotion and possibly splitting:
3094 if (LT.second.getScalarType() == MVT::i8)
3095 Cost *= 8;
3096 else if (LT.second.getScalarType() == MVT::i16)
3097 Cost *= 4;
3098 return Cost;
3099 } else {
3100 // If one of the operands is a uniform constant then the cost for each
3101 // element is the cost of insertion, extraction and the division itself.
3102 // Insertion cost = 2, extraction cost = 2, division = cost of the
3103 // operation with the scalar type.
3104 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
3105 (Op2Info.isConstant() && Op2Info.isUniform())) {
3106 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
3108 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
3109 return (4 + DivCost) * VTy->getNumElements();
3110 }
3111 }
3112 // On AArch64, without SVE, vector divisions are expanded
3113 // into scalar divisions of each pair of elements.
3114 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
3115 CostKind, Op1Info, Op2Info);
3116 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
3117 Op1Info, Op2Info);
3118 }
3119
3120 // TODO: if one of the arguments is scalar, then it's not necessary to
3121 // double the cost of handling the vector elements.
3122 Cost += Cost;
3123 }
3124 return Cost;
3125 }
3126 case ISD::MUL:
3127 // When SVE is available, we can lower the v2i64 operation using
3128 // the SVE mul instruction, which has a lower cost.
3129 if (LT.second == MVT::v2i64 && ST->hasSVE())
3130 return LT.first;
3131
3132 // When SVE is not available, there is no MUL.2d instruction,
3133 // which means mul <2 x i64> is expensive as elements are extracted
3134 // from the vectors and the muls scalarized.
3135 // As getScalarizationOverhead is a bit too pessimistic, we
3136 // estimate the cost for a i64 vector directly here, which is:
3137 // - four 2-cost i64 extracts,
3138 // - two 2-cost i64 inserts, and
3139 // - two 1-cost muls.
3140 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
3141 // LT.first = 2 the cost is 28. If both operands are extensions it will not
3142 // need to scalarize, so the cost can be cheaper (smull or umull).
3144 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
3145 return LT.first;
3146 return LT.first * 14;
3147 case ISD::ADD:
3148 case ISD::XOR:
3149 case ISD::OR:
3150 case ISD::AND:
3151 case ISD::SRL:
3152 case ISD::SRA:
3153 case ISD::SHL:
3154 // These nodes are marked as 'custom' for combining purposes only.
3155 // We know that they are legal. See LowerAdd in ISelLowering.
3156 return LT.first;
3157
3158 case ISD::FNEG:
3159 case ISD::FADD:
3160 case ISD::FSUB:
3161 // Increase the cost for half and bfloat types if not architecturally
3162 // supported.
3163 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
3164 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
3165 return 2 * LT.first;
3166 if (!Ty->getScalarType()->isFP128Ty())
3167 return LT.first;
3168 [[fallthrough]];
3169 case ISD::FMUL:
3170 case ISD::FDIV:
3171 // These nodes are marked as 'custom' just to lower them to SVE.
3172 // We know said lowering will incur no additional cost.
3173 if (!Ty->getScalarType()->isFP128Ty())
3174 return 2 * LT.first;
3175
3176 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3177 Op2Info);
3178 case ISD::FREM:
3179 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
3180 // those functions are not declared in the module.
3181 if (!Ty->isVectorTy())
3182 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
3183 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
3184 Op2Info);
3185 }
3186}
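
To make the scalarized v2i64 MUL estimate from the comment in the MUL case above concrete, the arithmetic works out as follows (a worked sketch; the per-operation costs are the ones quoted in that comment):

// four 2-cost i64 extracts + two 2-cost i64 inserts + two 1-cost muls = 14,
// scaled by the number of legalization steps (LT.first).
constexpr unsigned scalarizedV2I64MulCost(unsigned LegalizationSteps) {
  constexpr unsigned Extracts = 4 * 2, Inserts = 2 * 2, Muls = 2 * 1;
  return LegalizationSteps * (Extracts + Inserts + Muls);
}
static_assert(scalarizedV2I64MulCost(1) == 14 && scalarizedV2I64MulCost(2) == 28,
              "matches the 14 / 28 figures quoted in the comment");
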
3187
3189 ScalarEvolution *SE,
3190 const SCEV *Ptr) {
3191 // Address computations in vectorized code with non-consecutive addresses will
3192 // likely result in more instructions compared to scalar code where the
3193 // computation can more often be merged into the index mode. The resulting
3194 // extra micro-ops can significantly decrease throughput.
3195 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
3196 int MaxMergeDistance = 64;
3197
3198 if (Ty->isVectorTy() && SE &&
3199 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
3200 return NumVectorInstToHideOverhead;
3201
3202 // In many cases the address computation is not merged into the instruction
3203 // addressing mode.
3204 return 1;
3205}
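
Condensed, the heuristic above is a two-way choice; the sketch below mirrors it with the default value of the neon-nonconst-stride-overhead option hard-coded purely for illustration.

unsigned addressComputationCost(bool IsVectorAccess, bool StrideKnownAndSmall,
                                unsigned NonConstStrideOverhead = 10) {
  // Non-consecutive vector addresses cannot be folded into the addressing
  // mode, so they pay the configurable overhead; everything else costs 1.
  return (IsVectorAccess && !StrideKnownAndSmall) ? NonConstStrideOverhead : 1;
}
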
3206
3208 Type *CondTy,
3209 CmpInst::Predicate VecPred,
3211 const Instruction *I) {
3212 // TODO: Handle other cost kinds.
3214 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
3215 I);
3216
3217 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3218 // We don't lower some vector selects well when they are wider than the
3219 // register width.
3220 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
3221 // We would need this many instructions to hide the scalarization happening.
3222 const int AmortizationCost = 20;
3223
3224 // If VecPred is not set, check if we can get a predicate from the context
3225 // instruction, if its type matches the requested ValTy.
3226 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
3227 CmpInst::Predicate CurrentPred;
3228 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
3229 m_Value())))
3230 VecPred = CurrentPred;
3231 }
3232 // Check if we have a compare/select chain that can be lowered using
3233 // a (F)CMxx & BFI pair.
3234 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
3235 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
3236 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
3237 VecPred == CmpInst::FCMP_UNE) {
3238 static const auto ValidMinMaxTys = {
3239 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
3240 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
3241 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
3242
3243 auto LT = getTypeLegalizationCost(ValTy);
3244 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
3245 (ST->hasFullFP16() &&
3246 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
3247 return LT.first;
3248 }
3249
3250 static const TypeConversionCostTblEntry
3251 VectorSelectTbl[] = {
3252 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
3253 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
3254 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
3255 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
3256 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
3257 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
3258 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
3259 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
3260 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
3261 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
3262 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
3263 };
3264
3265 EVT SelCondTy = TLI->getValueType(DL, CondTy);
3266 EVT SelValTy = TLI->getValueType(DL, ValTy);
3267 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
3268 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
3269 SelCondTy.getSimpleVT(),
3270 SelValTy.getSimpleVT()))
3271 return Entry->Cost;
3272 }
3273 }
3274
3275 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
3276 auto LT = getTypeLegalizationCost(ValTy);
3277 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
3278 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
3279 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
3280 }
3281
3282 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
3283 // FIXME: This can apply to more conditions and add/sub if it can be shown to
3284 // be profitable.
3285 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
3286 ICmpInst::isEquality(VecPred) &&
3287 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
3288 match(I->getOperand(1), m_Zero()) &&
3289 match(I->getOperand(0), m_And(m_Value(), m_Value())))
3290 return 0;
3291
3292 // The base case handles scalable vectors fine for now, since it treats the
3293 // cost as 1 * legalization cost.
3294 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
3295}
3296
3298AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3300 if (ST->requiresStrictAlign()) {
3301 // TODO: Add cost modeling for strict align. Misaligned loads expand to
3302 // a bunch of instructions when strict align is enabled.
3303 return Options;
3304 }
3305 Options.AllowOverlappingLoads = true;
3306 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3307 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3308 // TODO: Though vector loads usually perform well on AArch64, on some targets
3309 // they may wake up the FP unit, which raises the power consumption. Perhaps
3310 // they could be used with no holds barred (-O3).
3311 Options.LoadSizes = {8, 4, 2, 1};
3312 Options.AllowedTailExpansions = {3, 5, 6};
3313 return Options;
3314}
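
One practical effect of AllowOverlappingLoads with LoadSizes = {8, 4, 2, 1} is that a compare of N >= 8 bytes never needs the small tail loads: the final 8-byte load can simply overlap bytes that were already compared. The helper below is an illustrative back-of-the-envelope count, not part of the expansion itself.

unsigned overlappingLoadCount(unsigned NumBytes, unsigned LoadSize = 8) {
  // Assumes NumBytes >= LoadSize; e.g. 15 bytes -> 2 loads per buffer,
  // covering [0, 8) and [7, 15).
  return (NumBytes + LoadSize - 1) / LoadSize;
}
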
3315
3317 return ST->hasSVE();
3318}
3319
3322 Align Alignment, unsigned AddressSpace,
3324 if (useNeonVector(Src))
3325 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
3326 CostKind);
3327 auto LT = getTypeLegalizationCost(Src);
3328 if (!LT.first.isValid())
3329 return InstructionCost::getInvalid();
3330
3331 // Return an invalid cost for element types that we are unable to lower.
3332 auto *VT = cast<VectorType>(Src);
3333 if (VT->getElementType()->isIntegerTy(1))
3334 return InstructionCost::getInvalid();
3335
3336 // The code-generator is currently not able to handle scalable vectors
3337 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3338 // it. This change will be removed when code-generation for these types is
3339 // sufficiently reliable.
3340 if (VT->getElementCount() == ElementCount::getScalable(1))
3341 return InstructionCost::getInvalid();
3342
3343 return LT.first;
3344}
3345
3346static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
3347 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
3348}
3349
3351 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
3352 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
3353 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
3354 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
3355 Alignment, CostKind, I);
3356 auto *VT = cast<VectorType>(DataTy);
3357 auto LT = getTypeLegalizationCost(DataTy);
3358 if (!LT.first.isValid())
3359 return InstructionCost::getInvalid();
3360
3361 // Return an invalid cost for element types that we are unable to lower.
3362 if (!LT.second.isVector() ||
3363 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
3364 VT->getElementType()->isIntegerTy(1))
3365 return InstructionCost::getInvalid();
3366
3367 // The code-generator is currently not able to handle scalable vectors
3368 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3369 // it. This change will be removed when code-generation for these types is
3370 // sufficiently reliable.
3371 if (VT->getElementCount() == ElementCount::getScalable(1))
3372 return InstructionCost::getInvalid();
3373
3374 ElementCount LegalVF = LT.second.getVectorElementCount();
3375 InstructionCost MemOpCost =
3376 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
3377 {TTI::OK_AnyValue, TTI::OP_None}, I);
3378 // Add on an overhead cost for using gathers/scatters.
3379 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
3380 // point we may want a per-CPU overhead.
3381 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
3382 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
3383}
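
The cost returned above is the scalar memory-op cost scaled by the gather/scatter overhead (sve-gather-overhead / sve-scatter-overhead, both defaulting to 10), by the maximum number of lanes, and by the legalization split factor. A worked sketch with illustrative numbers:

unsigned gatherScatterCostEstimate(unsigned ScalarMemOpCost, unsigned Overhead,
                                   unsigned MaxLanes, unsigned SplitFactor) {
  return SplitFactor * (ScalarMemOpCost * Overhead) * MaxLanes;
}
// e.g. scalar load cost 1, overhead 10, 4 lanes, no splitting: 1 * 10 * 4 = 40.
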
3384
3386 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
3387}
3388
3390 MaybeAlign Alignment,
3391 unsigned AddressSpace,
3393 TTI::OperandValueInfo OpInfo,
3394 const Instruction *I) {
3395 EVT VT = TLI->getValueType(DL, Ty, true);
3396 // Type legalization can't handle structs
3397 if (VT == MVT::Other)
3398 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
3399 CostKind);
3400
3401 auto LT = getTypeLegalizationCost(Ty);
3402 if (!LT.first.isValid())
3403 return InstructionCost::getInvalid();
3404
3405 // The code-generator is currently not able to handle scalable vectors
3406 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3407 // it. This change will be removed when code-generation for these types is
3408 // sufficiently reliable.
3409 // We also only support full register predicate loads and stores.
3410 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3411 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
3412 (VTy->getElementType()->isIntegerTy(1) &&
3413 !VTy->getElementCount().isKnownMultipleOf(
3414 ElementCount::getScalable(16))))
3415 return InstructionCost::getInvalid();
3416
3417 // TODO: consider latency as well for TCK_SizeAndLatency.
3419 return LT.first;
3420
3422 return 1;
3423
3424 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3425 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3426 // Unaligned stores are extremely inefficient. We don't split all
3427 // unaligned 128-bit stores because of the negative impact that has shown
3428 // in practice on inlined block copy code.
3429 // We make such stores expensive so that we will only vectorize if there
3430 // are 6 other instructions getting vectorized.
3431 const int AmortizationCost = 6;
3432
3433 return LT.first * 2 * AmortizationCost;
3434 }
3435
3436 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3437 if (Ty->isPtrOrPtrVectorTy())
3438 return LT.first;
3439
3440 if (useNeonVector(Ty)) {
3441 // Check truncating stores and extending loads.
3442 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3443 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3444 if (VT == MVT::v4i8)
3445 return 2;
3446 // Otherwise we need to scalarize.
3447 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3448 }
3449 EVT EltVT = VT.getVectorElementType();
3450 unsigned EltSize = EltVT.getScalarSizeInBits();
3451 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
3452 VT.getVectorNumElements() >= (128 / EltSize) || !Alignment ||
3453 *Alignment != Align(1))
3454 return LT.first;
3455 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
3456 // widening to v4i8, which produces suboptimal results.
3457 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
3458 return LT.first;
3459
3460 // Check non-power-of-2 loads/stores for legal vector element types with
3461 // NEON. Non-power-of-2 memory ops will get broken down to a set of
3462 // operations on smaller power-of-2 ops, including ld1/st1.
3463 LLVMContext &C = Ty->getContext();
3465 SmallVector<EVT> TypeWorklist;
3466 TypeWorklist.push_back(VT);
3467 while (!TypeWorklist.empty()) {
3468 EVT CurrVT = TypeWorklist.pop_back_val();
3469 unsigned CurrNumElements = CurrVT.getVectorNumElements();
3470 if (isPowerOf2_32(CurrNumElements)) {
3471 Cost += 1;
3472 continue;
3473 }
3474
3475 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
3476 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
3477 TypeWorklist.push_back(
3478 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
3479 }
3480 return Cost;
3481 }
3482
3483 return LT.first;
3484}
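
The non-power-of-2 NEON path above repeatedly peels off the largest power-of-2 chunk and costs each resulting piece as one operation. The standalone mirror below shows only that worklist decomposition, ignoring the element-type and alignment preconditions checked above.

#include <vector>

static bool isPow2(unsigned N) { return N && !(N & (N - 1)); }
static unsigned nextPow2(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

unsigned nonPow2VectorMemOpCost(unsigned NumElements) {
  unsigned Cost = 0;
  std::vector<unsigned> Worklist{NumElements};
  while (!Worklist.empty()) {
    unsigned N = Worklist.back();
    Worklist.pop_back();
    if (isPow2(N)) {
      ++Cost; // one ld1/st1-style operation per power-of-2 chunk
      continue;
    }
    unsigned PrevPow2 = nextPow2(N) / 2;
    Worklist.push_back(PrevPow2);
    Worklist.push_back(N - PrevPow2);
  }
  return Cost; // e.g. 7 lanes -> 4 + 2 + 1 -> cost 3
}
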
3485
3487 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3488 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3489 bool UseMaskForCond, bool UseMaskForGaps) {
3490 assert(Factor >= 2 && "Invalid interleave factor");
3491 auto *VecVTy = cast<VectorType>(VecTy);
3492
3493 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3494 return InstructionCost::getInvalid();
3495
3496 // Vectorization for masked interleaved accesses is only enabled for scalable
3497 // VF.
3498 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3499 return InstructionCost::getInvalid();
3500
3501 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3502 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3503 auto *SubVecTy =
3504 VectorType::get(VecVTy->getElementType(),
3505 VecVTy->getElementCount().divideCoefficientBy(Factor));
3506
3507 // ldN/stN only support legal vector types of size 64 or 128 in bits.
3508 // Accesses having vector types that are a multiple of 128 bits can be
3509 // matched to more than one ldN/stN instruction.
3510 bool UseScalable;
3511 if (MinElts % Factor == 0 &&
3512 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3513 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3514 }
3515
3516 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3517 Alignment, AddressSpace, CostKind,
3518 UseMaskForCond, UseMaskForGaps);
3519}
3520
3525 for (auto *I : Tys) {
3526 if (!I->isVectorTy())
3527 continue;
3528 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3529 128)
3530 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3531 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3532 }
3533 return Cost;
3534}
3535
3537 return ST->getMaxInterleaveFactor();
3538}
3539
3540// For Falkor, we want to avoid having too many strided loads in a loop since
3541// that can exhaust the HW prefetcher resources. We adjust the unroller
3542// MaxCount preference below to attempt to ensure unrolling doesn't create too
3543// many strided loads.
3544static void
3547 enum { MaxStridedLoads = 7 };
3548 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3549 int StridedLoads = 0;
3550 // FIXME? We could make this more precise by looking at the CFG and
3551 // e.g. not counting loads in each side of an if-then-else diamond.
3552 for (const auto BB : L->blocks()) {
3553 for (auto &I : *BB) {
3554 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3555 if (!LMemI)
3556 continue;
3557
3558 Value *PtrValue = LMemI->getPointerOperand();
3559 if (L->isLoopInvariant(PtrValue))
3560 continue;
3561
3562 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3563 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3564 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3565 continue;
3566
3567 // FIXME? We could take pairing of unrolled load copies into account
3568 // by looking at the AddRec, but we would probably have to limit this
3569 // to loops with no stores or other memory optimization barriers.
3570 ++StridedLoads;
3571 // We've seen enough strided loads that seeing more won't make a
3572 // difference.
3573 if (StridedLoads > MaxStridedLoads / 2)
3574 return StridedLoads;
3575 }
3576 }
3577 return StridedLoads;
3578 };
3579
3580 int StridedLoads = countStridedLoads(L, SE);
3581 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3582 << " strided loads\n");
3583 // Pick the largest power of 2 unroll count that won't result in too many
3584 // strided loads.
3585 if (StridedLoads) {
3586 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3587 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3588 << UP.MaxCount << '\n');
3589 }
3590}
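
With MaxStridedLoads = 7, the clamp above picks the largest power-of-two unroll count whose unrolled copies stay within the prefetcher budget. The sketch below reproduces the arithmetic without the LLVM helpers:

unsigned falkorMaxUnrollCount(unsigned StridedLoads, unsigned MaxStridedLoads = 7) {
  if (StridedLoads == 0)
    return 0; // no clamp is applied in this case
  unsigned Budget = MaxStridedLoads / StridedLoads; // e.g. 7 / 2 = 3
  unsigned MaxCount = 1;
  while (MaxCount * 2 <= Budget)                    // 1 << Log2_32(Budget)
    MaxCount *= 2;
  return MaxCount; // 2 strided loads -> MaxCount 2, 1 strided load -> MaxCount 4
}
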
3591
3595 // Enable partial unrolling and runtime unrolling.
3596 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3597
3598 UP.UpperBound = true;
3599
3600 // Inner loops are more likely to be hot, and the runtime check can be
3601 // hoisted out by the LICM pass, so the overhead is lower; try a larger
3602 // threshold to unroll more loops.
3603 if (L->getLoopDepth() > 1)
3604 UP.PartialThreshold *= 2;
3605
3606 // Disable partial & runtime unrolling on -Os.
3608
3609 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3612
3613 // Scan the loop: don't unroll loops with calls as this could prevent
3614 // inlining. Don't unroll vector loops either, as they don't benefit much from
3615 // unrolling.
3616 for (auto *BB : L->getBlocks()) {
3617 for (auto &I : *BB) {
3618 // Don't unroll vectorised loops.
3619 if (I.getType()->isVectorTy())
3620 return;
3621
3622 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3623 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3624 if (!isLoweredToCall(F))
3625 continue;
3626 }
3627 return;
3628 }
3629 }
3630 }
3631
3632 // Enable runtime unrolling for in-order models.
3633 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
3634 // checking for that case, we can ensure that the default behaviour is
3635 // unchanged.
3637 !ST->getSchedModel().isOutOfOrder()) {
3638 UP.Runtime = true;
3639 UP.Partial = true;
3640 UP.UnrollRemainder = true;
3642
3643 UP.UnrollAndJam = true;
3645 }
3646}
3647
3651}
3652
3654 Type *ExpectedType) {
3655 switch (Inst->getIntrinsicID()) {
3656 default:
3657 return nullptr;
3658 case Intrinsic::aarch64_neon_st2:
3659 case Intrinsic::aarch64_neon_st3:
3660 case Intrinsic::aarch64_neon_st4: {
3661 // Create a struct type
3662 StructType *ST = dyn_cast<StructType>(ExpectedType);
3663 if (!ST)
3664 return nullptr;
3665 unsigned NumElts = Inst->arg_size() - 1;
3666 if (ST->getNumElements() != NumElts)
3667 return nullptr;
3668 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3669 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3670 return nullptr;
3671 }
3672 Value *Res = PoisonValue::get(ExpectedType);
3673 IRBuilder<> Builder(Inst);
3674 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3675 Value *L = Inst->getArgOperand(i);
3676 Res = Builder.CreateInsertValue(Res, L, i);
3677 }
3678 return Res;
3679 }
3680 case Intrinsic::aarch64_neon_ld2:
3681 case Intrinsic::aarch64_neon_ld3:
3682 case Intrinsic::aarch64_neon_ld4:
3683 if (Inst->getType() == ExpectedType)
3684 return Inst;
3685 return nullptr;
3686 }
3687}
3688
3690 MemIntrinsicInfo &Info) {
3691 switch (Inst->getIntrinsicID()) {
3692 default:
3693 break;
3694 case Intrinsic::aarch64_neon_ld2:
3695 case Intrinsic::aarch64_neon_ld3:
3696 case Intrinsic::aarch64_neon_ld4:
3697 Info.ReadMem = true;
3698 Info.WriteMem = false;
3699 Info.PtrVal = Inst->getArgOperand(0);
3700 break;
3701 case Intrinsic::aarch64_neon_st2:
3702 case Intrinsic::aarch64_neon_st3:
3703 case Intrinsic::aarch64_neon_st4:
3704 Info.ReadMem = false;
3705 Info.WriteMem = true;
3706 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3707 break;
3708 }
3709
3710 switch (Inst->getIntrinsicID()) {
3711 default:
3712 return false;
3713 case Intrinsic::aarch64_neon_ld2:
3714 case Intrinsic::aarch64_neon_st2:
3715 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3716 break;
3717 case Intrinsic::aarch64_neon_ld3:
3718 case Intrinsic::aarch64_neon_st3:
3719 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3720 break;
3721 case Intrinsic::aarch64_neon_ld4:
3722 case Intrinsic::aarch64_neon_st4:
3723 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3724 break;
3725 }
3726 return true;
3727}
3728
3729/// See if \p I should be considered for address type promotion. We check if \p
3730/// I is a sext with the right type that is used in memory accesses. If it is
3731/// used in a "complex" getelementptr, we allow it to be promoted without
3732/// finding other sext instructions that sign extended the same initial value.
3733/// A getelementptr is considered "complex" if it has more than 2 operands.
3735 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3736 bool Considerable = false;
3737 AllowPromotionWithoutCommonHeader = false;
3738 if (!isa<SExtInst>(&I))
3739 return false;
3740 Type *ConsideredSExtType =
3741 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3742 if (I.getType() != ConsideredSExtType)
3743 return false;
3744 // See if the sext is the one with the right type and used in at least one
3745 // GetElementPtrInst.
3746 for (const User *U : I.users()) {
3747 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3748 Considerable = true;
3749 // A getelementptr is considered "complex" if it has more than 2
3750 // operands. We will promote a SExt used in such a complex GEP, as we
3751 // expect some computation to be merged if it is done on 64 bits.
3752 if (GEPInst->getNumOperands() > 2) {
3753 AllowPromotionWithoutCommonHeader = true;
3754 break;
3755 }
3756 }
3757 }
3758 return Considerable;
3759}
3760
3762 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3763 if (!VF.isScalable())
3764 return true;
3765
3766 Type *Ty = RdxDesc.getRecurrenceType();
3768 return false;
3769
3770 switch (RdxDesc.getRecurrenceKind()) {
3771 case RecurKind::Add:
3772 case RecurKind::FAdd:
3773 case RecurKind::And:
3774 case RecurKind::Or:
3775 case RecurKind::Xor:
3776 case RecurKind::SMin:
3777 case RecurKind::SMax:
3778 case RecurKind::UMin:
3779 case RecurKind::UMax:
3780 case RecurKind::FMin:
3781 case RecurKind::FMax:
3782 case RecurKind::FMulAdd:
3783 case RecurKind::IAnyOf:
3784 case RecurKind::FAnyOf:
3785 return true;
3786 default:
3787 return false;
3788 }
3789}
3790
3793 FastMathFlags FMF,
3795 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3796
3797 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3798 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3799
3800 InstructionCost LegalizationCost = 0;
3801 if (LT.first > 1) {
3802 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3803 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3804 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3805 }
3806
3807 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3808}
3809
3811 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3812 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3813 InstructionCost LegalizationCost = 0;
3814 if (LT.first > 1) {
3815 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3816 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3817 LegalizationCost *= LT.first - 1;
3818 }
3819
3820 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3821 assert(ISD && "Invalid opcode");
3822 // Add the final reduction cost for the legal horizontal reduction
3823 switch (ISD) {
3824 case ISD::ADD:
3825 case ISD::AND:
3826 case ISD::OR:
3827 case ISD::XOR:
3828 case ISD::FADD:
3829 return LegalizationCost + 2;
3830 default:
3832 }
3833}
3834
3837 std::optional<FastMathFlags> FMF,
3840 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3841 InstructionCost BaseCost =
3842 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3843 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3844 // end up vectorizing for more computationally intensive loops.
3845 return BaseCost + FixedVTy->getNumElements();
3846 }
3847
3848 if (Opcode != Instruction::FAdd)
3850
3851 auto *VTy = cast<ScalableVectorType>(ValTy);
3853 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3854 Cost *= getMaxNumElements(VTy->getElementCount());
3855 return Cost;
3856 }
3857
3858 if (isa<ScalableVectorType>(ValTy))
3859 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3860
3861 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3862 MVT MTy = LT.second;
3863 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3864 assert(ISD && "Invalid opcode");
3865
3866 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3867 // instructions as twice a normal vector add, plus 1 for each legalization
3868 // step (LT.first). This is the only arithmetic vector reduction operation for
3869 // which we have an instruction.
3870 // OR, XOR and AND costs should match the codegen from:
3871 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3872 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3873 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3874 static const CostTblEntry CostTblNoPairwise[]{
3875 {ISD::ADD, MVT::v8i8, 2},
3876 {ISD::ADD, MVT::v16i8, 2},
3877 {ISD::ADD, MVT::v4i16, 2},
3878 {ISD::ADD, MVT::v8i16, 2},
3879 {ISD::ADD, MVT::v4i32, 2},
3880 {ISD::ADD, MVT::v2i64, 2},
3881 {ISD::OR, MVT::v8i8, 15},
3882 {ISD::OR, MVT::v16i8, 17},
3883 {ISD::OR, MVT::v4i16, 7},
3884 {ISD::OR, MVT::v8i16, 9},
3885 {ISD::OR, MVT::v2i32, 3},
3886 {ISD::OR, MVT::v4i32, 5},
3887 {ISD::OR, MVT::v2i64, 3},
3888 {ISD::XOR, MVT::v8i8, 15},
3889 {ISD::XOR, MVT::v16i8, 17},
3890 {ISD::XOR, MVT::v4i16, 7},
3891 {ISD::XOR, MVT::v8i16, 9},
3892 {ISD::XOR, MVT::v2i32, 3},
3893 {ISD::XOR, MVT::v4i32, 5},
3894 {ISD::XOR, MVT::v2i64, 3},
3895 {ISD::AND, MVT::v8i8, 15},
3896 {ISD::AND, MVT::v16i8, 17},
3897 {ISD::AND, MVT::v4i16, 7},
3898 {ISD::AND, MVT::v8i16, 9},
3899 {ISD::AND, MVT::v2i32, 3},
3900 {ISD::AND, MVT::v4i32, 5},
3901 {ISD::AND, MVT::v2i64, 3},
3902 };
3903 switch (ISD) {
3904 default:
3905 break;
3906 case ISD::ADD:
3907 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3908 return (LT.first - 1) + Entry->Cost;
3909 break;
3910 case ISD::XOR:
3911 case ISD::AND:
3912 case ISD::OR:
3913 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3914 if (!Entry)
3915 break;
3916 auto *ValVTy = cast<FixedVectorType>(ValTy);
3917 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3918 isPowerOf2_32(ValVTy->getNumElements())) {
3919 InstructionCost ExtraCost = 0;
3920 if (LT.first != 1) {
3921 // Type needs to be split, so there is an extra cost of LT.first - 1
3922 // arithmetic ops.
3923 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3924 MTy.getVectorNumElements());
3925 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3926 ExtraCost *= LT.first - 1;
3927 }
3928 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3929 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3930 return Cost + ExtraCost;
3931 }
3932 break;
3933 }
3934 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3935}
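
For the ADD case above the estimate is simply the table entry plus one extra vector add per legalization split. As a worked example with illustrative types: an add reduction over v16i16 legalizes to two v8i16 halves, so LT.first = 2, MTy = v8i16, the table cost is 2, and the estimate is (2 - 1) + 2 = 3.

constexpr unsigned addvReductionCost(unsigned LegalizationSteps,
                                     unsigned TableCost = 2) {
  return (LegalizationSteps - 1) + TableCost;
}
static_assert(addvReductionCost(2) == 3, "v16i16 add-reduction estimate");
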
3936
3938 static const CostTblEntry ShuffleTbl[] = {
3939 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3940 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3941 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3942 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3943 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3944 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3945 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3946 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3947 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3948 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3949 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3950 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3951 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3952 };
3953
3954 // The code-generator is currently not able to handle scalable vectors
3955 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3956 // it. This change will be removed when code-generation for these types is
3957 // sufficiently reliable.
3958 if (Tp->getElementCount() == ElementCount::getScalable(1))
3959 return InstructionCost::getInvalid();
3960
3961 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3962 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3964 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3965 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3966 : LT.second;
3967 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3968 InstructionCost LegalizationCost = 0;
3969 if (Index < 0) {
3970 LegalizationCost =
3971 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3973 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3975 }
3976
3977 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
3978 // Cost performed on a promoted type.
3979 if (LT.second.getScalarType() == MVT::i1) {
3980 LegalizationCost +=
3981 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3983 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3985 }
3986 const auto *Entry =
3987 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3988 assert(Entry && "Illegal Type for Splice");
3989 LegalizationCost += Entry->Cost;
3990 return LegalizationCost * LT.first;
3991}
3992
3996 ArrayRef<const Value *> Args, const Instruction *CxtI) {
3997 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3998
3999 // If we have a Mask, and the LT is being legalized somehow, split the Mask
4000 // into smaller vectors and sum the cost of each shuffle.
4001 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
4002 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
4003 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
4004
4005 // Check for LD3/LD4 instructions, which are represented in llvm IR as
4006 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
4007 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
4008 // cost than just the load.
4009 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
4012 return std::max<InstructionCost>(1, LT.first / 4);
4013
4014 // Check for ST3/ST4 instructions, which are represented in llvm IR as
4015 // store(interleaving-shuffle). The shuffle cost could potentially be free,
4016 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
4017 // cost than just the store.
4018 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
4020 Mask, 4, Tp->getElementCount().getKnownMinValue() * 2) ||
4022 Mask, 3, Tp->getElementCount().getKnownMinValue() * 2)))
4023 return LT.first;
4024
4025 unsigned TpNumElts = Mask.size();
4026 unsigned LTNumElts = LT.second.getVectorNumElements();
4027 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
4028 VectorType *NTp =
4029 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
4031 for (unsigned N = 0; N < NumVecs; N++) {
4032 SmallVector<int> NMask;
4033 // Split the existing mask into chunks of size LTNumElts. Track the source
4034 // sub-vectors to ensure the result has at most 2 inputs.
4035 unsigned Source1, Source2;
4036 unsigned NumSources = 0;
4037 for (unsigned E = 0; E < LTNumElts; E++) {
4038 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
4040 if (MaskElt < 0) {
4042 continue;
4043 }
4044
4045 // Calculate which source from the input this comes from and whether it
4046 // is new to us.
4047 unsigned Source = MaskElt / LTNumElts;
4048 if (NumSources == 0) {
4049 Source1 = Source;
4050 NumSources = 1;
4051 } else if (NumSources == 1 && Source != Source1) {
4052 Source2 = Source;
4053 NumSources = 2;
4054 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
4055 NumSources++;
4056 }
4057
4058 // Add to the new mask. For the NumSources>2 case these are not correct,
4059 // but are only used for the modular lane number.
4060 if (Source == Source1)
4061 NMask.push_back(MaskElt % LTNumElts);
4062 else if (Source == Source2)
4063 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
4064 else
4065 NMask.push_back(MaskElt % LTNumElts);
4066 }
4067 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
4068 // getShuffleCost. If not then cost it using the worst case.
4069 if (NumSources <= 2)
4070 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
4072 NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
4073 else if (any_of(enumerate(NMask), [&](const auto &ME) {
4074 return ME.value() % LTNumElts == ME.index();
4075 }))
4076 Cost += LTNumElts - 1;
4077 else
4078 Cost += LTNumElts;
4079 }
4080 return Cost;
4081 }
4082
4083 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
4084 // Treat extractsubvector as single op permutation.
4085 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
4086 if (IsExtractSubvector && LT.second.isFixedLengthVector())
4088
4089 // Check for broadcast loads, which are supported by the LD1R instruction.
4090 // In terms of code-size, the shuffle vector is free when a load + dup get
4091 // folded into a LD1R. That's what we check and return here. For performance
4092 // and reciprocal throughput, a LD1R is not completely free. In this case, we
4093 // return the cost for the broadcast below (i.e. 1 for most/all types), so
4094 // that we model the load + dup sequence slightly higher because LD1R is a
4095 // high latency instruction.
4096 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
4097 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
4098 if (IsLoad && LT.second.isVector() &&
4100 LT.second.getVectorElementCount()))
4101 return 0;
4102 }
4103
4104 // If we have 4 elements for the shuffle and a Mask, get the cost straight
4105 // from the perfect shuffle tables.
4106 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
4107 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
4108 all_of(Mask, [](int E) { return E < 8; }))
4109 return getPerfectShuffleCost(Mask);
4110
4111 // Check for identity masks, which we can treat as free.
4112 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
4113 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4114 all_of(enumerate(Mask), [](const auto &M) {
4115 return M.value() < 0 || M.value() == (int)M.index();
4116 }))
4117 return 0;
4118
4119 // Check for other shuffles that are not SK_ kinds but that we have native
4120 // instructions for, for example ZIP and UZP.
4121 unsigned Unused;
4122 if (LT.second.isFixedLengthVector() &&
4123 LT.second.getVectorNumElements() == Mask.size() &&
4124 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
4125 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4126 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
4127 // Check for non-zero lane splats
4128 all_of(drop_begin(Mask),
4129 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
4130 return 1;
4131
4132 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
4133 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
4134 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
4135 static const CostTblEntry ShuffleTbl[] = {
4136 // Broadcast shuffle kinds can be performed with 'dup'.
4137 {TTI::SK_Broadcast, MVT::v8i8, 1},
4138 {TTI::SK_Broadcast, MVT::v16i8, 1},
4139 {TTI::SK_Broadcast, MVT::v4i16, 1},
4140 {TTI::SK_Broadcast, MVT::v8i16, 1},
4141 {TTI::SK_Broadcast, MVT::v2i32, 1},
4142 {TTI::SK_Broadcast, MVT::v4i32, 1},
4143 {TTI::SK_Broadcast, MVT::v2i64, 1},
4144 {TTI::SK_Broadcast, MVT::v4f16, 1},
4145 {TTI::SK_Broadcast, MVT::v8f16, 1},
4146 {TTI::SK_Broadcast, MVT::v2f32, 1},
4147 {TTI::SK_Broadcast, MVT::v4f32, 1},
4148 {TTI::SK_Broadcast, MVT::v2f64, 1},
4149 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
4150 // 'zip1/zip2' instructions.
4151 {TTI::SK_Transpose, MVT::v8i8, 1},
4152 {TTI::SK_Transpose, MVT::v16i8, 1},
4153 {TTI::SK_Transpose, MVT::v4i16, 1},
4154 {TTI::SK_Transpose, MVT::v8i16, 1},
4155 {TTI::SK_Transpose, MVT::v2i32, 1},
4156 {TTI::SK_Transpose, MVT::v4i32, 1},
4157 {TTI::SK_Transpose, MVT::v2i64, 1},
4158 {TTI::SK_Transpose, MVT::v4f16, 1},
4159 {TTI::SK_Transpose, MVT::v8f16, 1},
4160 {TTI::SK_Transpose, MVT::v2f32, 1},
4161 {TTI::SK_Transpose, MVT::v4f32, 1},
4162 {TTI::SK_Transpose, MVT::v2f64, 1},
4163 // Select shuffle kinds.
4164 // TODO: handle vXi8/vXi16.
4165 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
4166 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
4167 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
4168 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
4169 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
4170 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
4171 // PermuteSingleSrc shuffle kinds.
4172 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
4173 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
4174 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
4175 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
4176 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
4177 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
4178 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
4179 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
4180 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
4181 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
4182 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
4183 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
4184 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
4185 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
4186 // Reverse can be lowered with `rev`.
4187 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
4188 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
4189 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
4190 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
4191 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
4192 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
4193 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
4194 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
4195 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
4196 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
4197 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
4198 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
4199 // Splice can all be lowered as `ext`.
4200 {TTI::SK_Splice, MVT::v2i32, 1},
4201 {TTI::SK_Splice, MVT::v4i32, 1},
4202 {TTI::SK_Splice, MVT::v2i64, 1},
4203 {TTI::SK_Splice, MVT::v2f32, 1},
4204 {TTI::SK_Splice, MVT::v4f32, 1},
4205 {TTI::SK_Splice, MVT::v2f64, 1},
4206 {TTI::SK_Splice, MVT::v8f16, 1},
4207 {TTI::SK_Splice, MVT::v8bf16, 1},
4208 {TTI::SK_Splice, MVT::v8i16, 1},
4209 {TTI::SK_Splice, MVT::v16i8, 1},
4210 {TTI::SK_Splice, MVT::v4bf16, 1},
4211 {TTI::SK_Splice, MVT::v4f16, 1},
4212 {TTI::SK_Splice, MVT::v4i16, 1},
4213 {TTI::SK_Splice, MVT::v8i8, 1},
4214 // Broadcast shuffle kinds for scalable vectors
4215 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
4216 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
4217 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
4218 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
4219 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
4220 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
4221 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
4222 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
4223 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
4224 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
4225 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
4226 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
4227 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
4228 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
4229 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
4230 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
4231 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
4232 // Handle the cases for vector.reverse with scalable vectors
4233 {TTI::SK_Reverse, MVT::nxv16i8, 1},
4234 {TTI::SK_Reverse, MVT::nxv8i16, 1},
4235 {TTI::SK_Reverse, MVT::nxv4i32, 1},
4236 {TTI::SK_Reverse, MVT::nxv2i64, 1},
4237 {TTI::SK_Reverse, MVT::nxv2f16, 1},
4238 {TTI::SK_Reverse, MVT::nxv4f16, 1},
4239 {TTI::SK_Reverse, MVT::nxv8f16, 1},
4240 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
4241 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
4242 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
4243 {TTI::SK_Reverse, MVT::nxv2f32, 1},
4244 {TTI::SK_Reverse, MVT::nxv4f32, 1},
4245 {TTI::SK_Reverse, MVT::nxv2f64, 1},
4246 {TTI::SK_Reverse, MVT::nxv16i1, 1},
4247 {TTI::SK_Reverse, MVT::nxv8i1, 1},
4248 {TTI::SK_Reverse, MVT::nxv4i1, 1},
4249 {TTI::SK_Reverse, MVT::nxv2i1, 1},
4250 };
4251 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
4252 return LT.first * Entry->Cost;
4253 }
4254
4255 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
4256 return getSpliceCost(Tp, Index);
4257
4258 // Inserting a subvector can often be done with either a D, S or H register
4259 // move, so long as the inserted vector is "aligned".
4260 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
4261 LT.second.getSizeInBits() <= 128 && SubTp) {
4262 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
4263 if (SubLT.second.isVector()) {
4264 int NumElts = LT.second.getVectorNumElements();
4265 int NumSubElts = SubLT.second.getVectorNumElements();
4266 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
4267 return SubLT.first;
4268 }
4269 }
4270
4271 // Restore optimal kind.
4272 if (IsExtractSubvector)
4274 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args,
4275 CxtI);
4276}
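
The mask-splitting loop above hinges on counting how many distinct legal-width source sub-vectors each chunk of the mask touches. The standalone mirror below (simplified and illustrative only) performs just that bookkeeping for one chunk:

#include <vector>

unsigned countChunkSources(const std::vector<int> &Mask, unsigned Begin,
                           unsigned LTNumElts) {
  int Source1 = -1, Source2 = -1;
  unsigned NumSources = 0;
  for (unsigned E = 0; E != LTNumElts; ++E) {
    unsigned Idx = Begin + E;
    int MaskElt = Idx < Mask.size() ? Mask[Idx] : -1;
    if (MaskElt < 0)
      continue; // an undef/poison lane contributes no source
    int Source = MaskElt / (int)LTNumElts;
    if (NumSources == 0) {
      Source1 = Source;
      NumSources = 1;
    } else if (NumSources == 1 && Source != Source1) {
      Source2 = Source;
      NumSources = 2;
    } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
      ++NumSources;
    }
  }
  // Chunks touching at most 2 sources can be re-costed as ordinary one- or
  // two-input shuffles; anything more falls back to the worst-case estimate.
  return NumSources;
}
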
4277
4280 const auto &Strides = DenseMap<Value *, const SCEV *>();
4281 for (BasicBlock *BB : TheLoop->blocks()) {
4282 // Scan the instructions in the block and look for addresses that are
4283 // consecutive and decreasing.
4284 for (Instruction &I : *BB) {
4285 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
4287 Type *AccessTy = getLoadStoreType(&I);
4288 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
4289 /*ShouldCheckWrap=*/false)
4290 .value_or(0) < 0)
4291 return true;
4292 }
4293 }
4294 }
4295 return false;
4296}
4297
4299 if (!ST->hasSVE())
4300 return false;
4301
4302 // We don't currently support vectorisation with interleaving for SVE - with
4303 // such loops we're better off not using tail-folding. This gives us a chance
4304 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
4305 if (TFI->IAI->hasGroups())
4306 return false;
4307
4309 if (TFI->LVL->getReductionVars().size())
4310 Required |= TailFoldingOpts::Reductions;
4311 if (TFI->LVL->getFixedOrderRecurrences().size())
4312 Required |= TailFoldingOpts::Recurrences;
4313
4314 // We call this to discover whether any load/store pointers in the loop have
4315 // negative strides. This will require extra work to reverse the loop
4316 // predicate, which may be expensive.
4319 Required |= TailFoldingOpts::Reverse;
4320 if (Required == TailFoldingOpts::Disabled)
4321 Required |= TailFoldingOpts::Simple;
4322
4324 Required))
4325 return false;
4326
4327 // Don't tail-fold for tight loops where we would be better off interleaving
4328 // with an unpredicated loop.
4329 unsigned NumInsns = 0;
4330 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
4331 NumInsns += BB->sizeWithoutDebug();
4332 }
4333
4334 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
4335 return NumInsns >= SVETailFoldInsnThreshold;
4336}
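
The decision above boils down to accumulating the tail-folding features the loop requires, checking them against the option/default mask, and then applying the instruction-count threshold (sve-tail-folding-insn-threshold, default 15). The enum values in the sketch below are illustrative stand-ins, not the real TailFoldingOpts encoding:

enum TailFoldOpts : unsigned {
  TFDisabled = 0,
  TFSimple = 1 << 0,
  TFReductions = 1 << 1,
  TFRecurrences = 1 << 2,
  TFReverse = 1 << 3,
};

unsigned requiredTailFoldingOpts(bool HasReductions, bool HasRecurrences,
                                 bool HasNegativeStrides) {
  unsigned Required = TFDisabled;
  if (HasReductions)
    Required |= TFReductions;
  if (HasRecurrences)
    Required |= TFRecurrences;
  if (HasNegativeStrides)
    Required |= TFReverse;
  if (Required == TFDisabled)
    Required |= TFSimple;
  return Required;
}
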
4337
4340 StackOffset BaseOffset, bool HasBaseReg,
4341 int64_t Scale, unsigned AddrSpace) const {
4342 // Scaling factors are not free at all.
4343 // Operands | Rt Latency
4344 // -------------------------------------------
4345 // Rt, [Xn, Xm] | 4
4346 // -------------------------------------------
4347 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
4348 // Rt, [Xn, Wm, <extend> #imm] |
4350 AM.BaseGV = BaseGV;
4351 AM.BaseOffs = BaseOffset.getFixed();
4352 AM.HasBaseReg = HasBaseReg;
4353 AM.Scale = Scale;
4354 AM.ScalableOffset = BaseOffset.getScalable();
4355 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
4356 // Scale represents reg2 * scale, thus account for 1 if
4357 // it is not equal to 0 or 1.
4358 return AM.Scale != 0 && AM.Scale != 1;
4359 return -1;
4360}
4361
4363 // For the binary operators (e.g. or) we need to be more careful than for
4364 // selects; here we only transform them if they are already at a natural
4365 // break point in the code - the end of a block with an unconditional
4366 // terminator.
4367 if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or &&
4368 isa<BranchInst>(I->getNextNode()) &&
4369 cast<BranchInst>(I->getNextNode())->isUnconditional())
4370 return true;
4372}
4373
4375 const TargetTransformInfo::LSRCost &C2) {
4376 // The AArch64-specific part here is adding the number of instructions to
4377 // the comparison (though not as the first consideration, as some targets
4378 // do), along with changing the priority of the base additions.
4379 // TODO: Maybe a more nuanced tradeoff between instruction count
4380 // and number of registers? To be investigated at a later date.
4381 if (EnableLSRCostOpt)
4382 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
4383 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4384 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
4385 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4386
4388}
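
Reduced to plain data, the AArch64 ordering above is a lexicographic tuple comparison in which the register count still comes first, the instruction count is promoted to second place, and base additions are weighted ahead of the remaining terms. A self-contained sketch (the struct is a stand-in whose field names mirror TargetTransformInfo::LSRCost):

#include <tuple>

struct LSRCostLite {
  unsigned NumRegs, Insns, NumBaseAdds, AddRecCost, NumIVMuls, ScaleCost,
      ImmCost, SetupCost;
};

bool aarch64LSRCostLess(const LSRCostLite &C1, const LSRCostLite &C2) {
  return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
                  C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
         std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
                  C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
}
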
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEAllOrNoActive(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVENoActiveUnaryZero(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVENoActiveUnaryErase(InstCombiner &IC, IntrinsicInst &II, int PredPos)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static InstructionCost getHistogramCost(const IntrinsicCostAttributes &ICA)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static bool hasPossibleIncompatibleOps(const Function *F)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
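The option string is an initial mode optionally followed by '+'-separated flags, as described above. For example (an illustrative combination, typically passed to the backend via -mllvm when invoking clang), -sve-tail-folding=all+noreverse enables tail-folding for all legal loop types except those requiring reversed predicates, while -sve-tail-folding=simple+reductions adds reduction loops on top of the simple-loop setting.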
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file provides a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
This file defines the LoopVectorizationLegality class.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
bool isSVEorStreamingSVEAvailable() const
Returns true if the target has access to either the full range of SVE instructions,...
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
unsigned getMinSVEVectorSizeInBits() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:429
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1629
unsigned countLeadingOnes() const
Definition: APInt.h:1583
void negate()
Negate this APInt in place.
Definition: APInt.h:1430
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1719
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:807
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:420
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1522
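A minimal, self-contained sketch (the values and function name are illustrative, not taken from this file) of the APInt queries listed above, roughly as they are used when costing immediates:
#include "llvm/ADT/APInt.h"
using namespace llvm;

void apintDemo() {
  APInt Imm(64, 16);                      // 64-bit value 16
  bool Pow2 = Imm.isPowerOf2();           // true: 16 == 1 << 4
  unsigned Log = Imm.logBase2();          // 4
  unsigned SetBits = Imm.popcount();      // 1 bit set
  APInt Shifted = Imm.ashr(2);            // arithmetic shift right: 4
  APInt Narrow = Imm.sextOrTrunc(32);     // sign-extend or truncate to 32 bits
  int64_t SVal = Imm.getSExtValue();      // 16
  APInt Neg(64, -16, /*isSigned=*/true);
  bool NegPow2 = Neg.isNegatedPowerOf2(); // true: -(-16) is a power of two
  (void)Pow2; (void)Log; (void)SetBits; (void)Shifted;
  (void)Narrow; (void)SVal; (void)NegPow2;
}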
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:61
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:588
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:975
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:768
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:660
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:897
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:861
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
Definition: BasicTTIImpl.h:340
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition: InstrTypes.h:246
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1236
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1465
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1410
unsigned arg_size() const
Definition: InstrTypes.h:1408
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:757
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:760
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:763
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:761
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:762
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:764
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:773
bool isIntPredicate() const
Definition: InstrTypes.h:865
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1650
This is the shared class of boolean and integer constants.
Definition: Constants.h:81
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition: Constants.h:206
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:146
static Constant * get(StructType *T, ArrayRef< Constant * > V)
Definition: Constants.cpp:1357
This is an important base class in LLVM.
Definition: Constant.h:42
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:370
This class represents an Operation in the Expression.
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:110
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:314
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:311
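A short sketch (illustrative) contrasting the two kinds of element count referenced above:
#include "llvm/Support/TypeSize.h"
using namespace llvm;

// 4 lanes known at compile time, e.g. <4 x i32>.
constexpr ElementCount FixedEC = ElementCount::getFixed(4);
// A runtime multiple of vscale, e.g. <vscale x 2 x i64>; only the minimum (2) is known.
constexpr ElementCount ScalableEC = ElementCount::getScalable(2);
static_assert(!FixedEC.isScalable() && ScalableEC.isScalable(),
              "fixed vs. scalable element counts");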
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:692
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:915
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:89
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2477
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition: IRBuilder.h:2528
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1050
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2465
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:536
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:556
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcasted NumElts times.
Definition: IRBuilder.cpp:1193
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:933
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:579
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1091
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:523
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:541
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:308
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:528
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition: IRBuilder.h:1871
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:488
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2210
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2402
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2132
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1795
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2499
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1808
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:599
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:551
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1671
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2201
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:177
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2671
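A minimal sketch (not taken from this file; the function and value names are illustrative) of the IRBuilder calls listed above, building a vscale-scaled count, a vector splat and an intrinsic call:
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

static Value *buildExample(Function &F) {
  IRBuilder<> Builder(F.getContext());
  // Assumes F's entry block is still being populated (no terminator yet).
  Builder.SetInsertPoint(&F.getEntryBlock());

  // vscale * 4, e.g. the runtime element count of <vscale x 4 x i32>.
  Value *VL = Builder.CreateVScale(ConstantInt::get(Builder.getInt64Ty(), 4));

  // Splat the constant 1 across a fixed 4-element vector.
  Value *Splat = Builder.CreateVectorSplat(4, Builder.getInt32(1));

  // Per-element population count via the llvm.ctpop intrinsic.
  Value *Pop = Builder.CreateIntrinsic(Intrinsic::ctpop, {Splat->getType()}, {Splat});
  (void)VL;
  return Pop;
}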
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:47
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:386
BuilderTy & Builder
Definition: InstCombiner.h:60
static InstructionCost getInvalid(CostType Val=0)
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:274
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:676
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:55
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:174
Value * getPointerOperand()
Definition: Instructions.h:253
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:44
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:662
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1852
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:71
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
bool isNewZA() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:713
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
Definition: DerivedTypes.h:627
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition: SmallPtrSet.h:94
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:818
void resize(size_type N)
Definition: SmallVector.h:651
void push_back(const T &Elt)
Definition: SmallVector.h:426
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1209
StackOffset holds a fixed and a scalable offset in bytes.
Definition: TypeSize.h:33
static StackOffset getScalable(int64_t Scalable)
Definition: TypeSize.h:43
static StackOffset getFixed(int64_t Fixed)
Definition: TypeSize.h:42
An instruction for storing to memory.
Definition: Instructions.h:290
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:685
A switch()-like statement whose cases are string literals.
Definition: StringSwitch.h:44
StringSwitch & Case(StringLiteral S, T Value)
Definition: StringSwitch.h:69
R Default(T Value)
Definition: StringSwitch.h:182
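An illustrative sketch (not the actual option parser) using StringRef::split and StringSwitch, in the spirit of how a "simple+reductions"-style string can be decoded:
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;

enum class InitialMode { Disabled, Simple, All, Unknown };

static InitialMode parseInitial(StringRef Opt) {
  // Only the text before the first '+' selects the initial mode.
  StringRef Head = Opt.split('+').first;
  return StringSwitch<InitialMode>(Head)
      .Case("disabled", InitialMode::Disabled)
      .Case("simple", InitialMode::Simple)
      .Case("all", InitialMode::All)
      .Default(InitialMode::Unknown);
}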
Class to represent struct types.
Definition: DerivedTypes.h:216
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in ord...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
bool shouldTreatInstructionLikeSelect(const Instruction *I)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:345
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:348
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
bool isPointerTy() const
True if this is an instance of PointerType.
Definition: Type.h:255
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:927
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:383
Base class of all SIMD vector types.
Definition: DerivedTypes.h:403
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:641
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:676
Type * getElementType() const
Definition: DerivedTypes.h:436
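A brief sketch (illustrative) of constructing the fixed and scalable vector types referenced above:
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;

void vectorTypeDemo(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  auto *V4I32 = FixedVectorType::get(I32, 4);                         // <4 x i32>
  auto *NxV4I32 = ScalableVectorType::get(I32, 4);                    // <vscale x 4 x i32>
  auto *Generic = VectorType::get(I32, ElementCount::getScalable(4)); // same as NxV4I32
  (void)V4I32; (void)NxV4I32; (void)Generic;
}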
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:171
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition: TypeSize.h:168
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
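A sketch in the spirit of the immediate-cost queries above (the helper name is illustrative): the number of instructions produced by expandMOVImm approximates the cost of materializing a 64-bit immediate.
#include "AArch64ExpandImm.h"
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

static unsigned movImmInsnCount(uint64_t Imm) {
  // Expand the pseudo move-immediate and count the real instructions needed.
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, /*BitSize=*/64, Insn);
  return Insn.empty() ? 1u : unsigned(Insn.size());
}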
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:778
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:246
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:818
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:397
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:931
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:802
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:958
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:755
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:673
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:733
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:808
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:916
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:864
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:708
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:897
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:814
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1513
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:100
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:972
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
Definition: PatternMatch.h:816
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:875
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:560
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:592
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
Definition: PatternMatch.h:599
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:105
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:921
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:92
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
Definition: PatternMatch.h:612
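A small sketch (illustrative) of the PatternMatch helpers listed above, matching a one-use "add X, 1" with the operands in either order:
#include "llvm/IR/PatternMatch.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isOneUseAddOfOne(Value *V, Value *&X) {
  // m_c_Add accepts the operands commuted; m_One matches the constant 1
  // (or a splat of 1 for vector types).
  return match(V, m_OneUse(m_c_Add(m_Value(X), m_One())));
}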
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:443
LocationClass< Ty > location(Ty &L)
Definition: CommandLine.h:463
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition: STLExtras.h:329
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1722
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition: CostTable.h:35
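An illustrative cost-table lookup in the style used throughout this TTI implementation (the table contents here are made up):
#include "llvm/CodeGen/CostTable.h"
#include "llvm/CodeGen/ISDOpcodes.h"
using namespace llvm;

static unsigned lookupAddCost(MVT Ty) {
  // Hypothetical per-type costs; the real tables live in the TTI implementation.
  static const CostTblEntry AddTbl[] = {
      {ISD::ADD, MVT::v4i32, 1},
      {ISD::ADD, MVT::v2i64, 1},
  };
  if (const auto *Entry = CostTableLookup(AddTbl, ISD::ADD, Ty))
    return Entry->Cost;
  return 4; // fall-back guess for unlisted types
}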
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A,...
Definition: STLExtras.h:2400
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
AddressSpace
Definition: NVPTXBaseInfo.h:21
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition: MathExtras.h:296
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1729
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition: MathExtras.h:340
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
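A quick illustration of the MathExtras helpers listed above:
#include "llvm/Support/MathExtras.h"
using namespace llvm;

static_assert(isPowerOf2_32(64u), "64 is a power of two");
static_assert(isPowerOf2_64(uint64_t(1) << 40), "2^40 is a power of two");
// Log2_32 returns the floor log base 2, so Log2_32(64) == 6 and Log2_32(100) == 6.
inline bool log2Demo() { return Log2_32(64) == 6 && Log2_32(100) == 6; }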
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
Definition: Error.cpp:167
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12,...
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Add
Sum of integers.
@ FAdd
Sum of floats.
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are i...
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
InstructionCost Cost
@ Default
The result values are uniform if and only if all operands are uniform.
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition: CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition: MathExtras.h:382
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Cost Table Entry.
Definition: CostTable.h:25
Extended Value Type.
Definition: ValueTypes.h:34
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:136
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition: ValueTypes.h:73
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition: ValueTypes.h:274
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition: ValueTypes.h:358
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:370
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition: ValueTypes.h:306
bool isFixedLengthVector() const
Definition: ValueTypes.h:177
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
Definition: ValueTypes.cpp:203
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition: ValueTypes.h:173
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition: ValueTypes.h:318
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition: ValueTypes.h:326
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition: Alignment.h:117
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if the result is only compared against zero (i.e. memcmp(p1, p2, s) == 0).
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Type Conversion Cost Table.
Definition: CostTable.h:55