1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
43 cl::init(15), cl::Hidden);
44
45static cl::opt<unsigned>
46 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
47 cl::Hidden);
48
49namespace {
50class TailFoldingOption {
51 // These bitfields will only ever be set to something non-zero in operator=,
52 // when setting the -sve-tail-folding option. This option should always be of
53 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
54 // InitialBits is one of (disabled|all|simple). EnableBits represents
55 // additional flags we're enabling, and DisableBits for those flags we're
56 // disabling. The default flag is tracked in the variable NeedsDefault, since
57 // at the time of setting the option we may not know what the default value
58 // for the CPU is.
59 TailFoldingOpts InitialBits = TailFoldingOpts::Disabled;
60 TailFoldingOpts EnableBits = TailFoldingOpts::Disabled;
61 TailFoldingOpts DisableBits = TailFoldingOpts::Disabled;
62
63 // This value needs to be initialised to true in case the user does not
64 // explicitly set the -sve-tail-folding option.
65 bool NeedsDefault = true;
66
67 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
68
69 void setNeedsDefault(bool V) { NeedsDefault = V; }
70
71 void setEnableBit(TailFoldingOpts Bit) {
72 EnableBits |= Bit;
73 DisableBits &= ~Bit;
74 }
75
76 void setDisableBit(TailFoldingOpts Bit) {
77 EnableBits &= ~Bit;
78 DisableBits |= Bit;
79 }
80
81 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
82 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
83
84 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
85 "Initial bits should only include one of "
86 "(disabled|all|simple|default)");
87 Bits = NeedsDefault ? DefaultBits : InitialBits;
88 Bits |= EnableBits;
89 Bits &= ~DisableBits;
90
91 return Bits;
92 }
93
94 void reportError(std::string Opt) {
95 errs() << "invalid argument '" << Opt
96 << "' to -sve-tail-folding=; the option should be of the form\n"
97 " (disabled|all|default|simple)[+(reductions|recurrences"
98 "|reverse|noreductions|norecurrences|noreverse)]\n";
99 report_fatal_error("Unrecognised tail-folding option");
100 }
101
102public:
103
104 void operator=(const std::string &Val) {
105 // If the user explicitly sets -sve-tail-folding= then treat as an error.
106 if (Val.empty()) {
107 reportError("");
108 return;
109 }
110
111 // Since the user is explicitly setting the option we don't automatically
112 // need the default unless they require it.
113 setNeedsDefault(false);
114
115 SmallVector<StringRef, 4> TailFoldTypes;
116 StringRef(Val).split(TailFoldTypes, '+', -1, false);
117
118 unsigned StartIdx = 1;
119 if (TailFoldTypes[0] == "disabled")
120 setInitialBits(TailFoldingOpts::Disabled);
121 else if (TailFoldTypes[0] == "all")
122 setInitialBits(TailFoldingOpts::All);
123 else if (TailFoldTypes[0] == "default")
124 setNeedsDefault(true);
125 else if (TailFoldTypes[0] == "simple")
126 setInitialBits(TailFoldingOpts::Simple);
127 else {
128 StartIdx = 0;
129 setInitialBits(TailFoldingOpts::Disabled);
130 }
131
132 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
133 if (TailFoldTypes[I] == "reductions")
134 setEnableBit(TailFoldingOpts::Reductions);
135 else if (TailFoldTypes[I] == "recurrences")
136 setEnableBit(TailFoldingOpts::Recurrences);
137 else if (TailFoldTypes[I] == "reverse")
138 setEnableBit(TailFoldingOpts::Reverse);
139 else if (TailFoldTypes[I] == "noreductions")
140 setDisableBit(TailFoldingOpts::Reductions);
141 else if (TailFoldTypes[I] == "norecurrences")
142 setDisableBit(TailFoldingOpts::Recurrences);
143 else if (TailFoldTypes[I] == "noreverse")
144 setDisableBit(TailFoldingOpts::Reverse);
145 else
146 reportError(Val);
147 }
148 }
149
150 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
151 return (getBits(DefaultBits) & Required) == Required;
152 }
153};
154} // namespace
155
156TailFoldingOption TailFoldingOptionLoc;
157
159 "sve-tail-folding",
160 cl::desc(
161 "Control the use of vectorisation using tail-folding for SVE where the"
162 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
163 "\ndisabled (Initial) No loop types will vectorize using "
164 "tail-folding"
165 "\ndefault (Initial) Uses the default tail-folding settings for "
166 "the target CPU"
167 "\nall (Initial) All legal loop types will vectorize using "
168 "tail-folding"
169 "\nsimple (Initial) Use tail-folding for simple loops (not "
170 "reductions or recurrences)"
171 "\nreductions Use tail-folding for loops containing reductions"
172 "\nnoreductions Inverse of above"
173 "\nrecurrences Use tail-folding for loops containing fixed order "
174 "recurrences"
175 "\nnorecurrences Inverse of above"
176 "\nreverse Use tail-folding for loops requiring reversed "
177 "predicates"
178 "\nnoreverse Inverse of above"),
180
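// Example (illustrative only): -sve-tail-folding=default+noreverse keeps
// NeedsDefault set, so getBits() starts from the CPU's default bits and then
// clears TailFoldingOpts::Reverse via DisableBits; with a CPU default of
// "all" this resolves to (All & ~Reverse), with a default of "disabled" it
// stays disabled.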
181// Experimental option that will only be fully functional when the
182// code-generator is changed to use SVE instead of NEON for all fixed-width
183// operations.
185 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
186
187// Experimental option that will only be fully functional when the cost-model
188// and code-generator have been changed to avoid using scalable vector
189// instructions that are not legal in streaming SVE mode.
191 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
192
193bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
194 const Function *Callee) const {
195 SMEAttrs CallerAttrs(*Caller);
196 SMEAttrs CalleeAttrs(*Callee);
197 if (CallerAttrs.requiresSMChange(CalleeAttrs,
198 /*BodyOverridesInterface=*/true) ||
199 CallerAttrs.requiresLazySave(CalleeAttrs) ||
200 CalleeAttrs.hasNewZABody())
201 return false;
202
203 const TargetMachine &TM = getTLI()->getTargetMachine();
204
205 const FeatureBitset &CallerBits =
206 TM.getSubtargetImpl(*Caller)->getFeatureBits();
207 const FeatureBitset &CalleeBits =
208 TM.getSubtargetImpl(*Callee)->getFeatureBits();
209
210 // Inline a callee if its target-features are a subset of the callers
211 // target-features.
212 return (CallerBits & CalleeBits) == CalleeBits;
213}
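// For example (hypothetical feature sets): a caller built with
// {+neon,+sve,+fullfp16} may inline a callee requiring {+neon,+sve}, since
// (CallerBits & CalleeBits) == CalleeBits, but not one requiring {+sve2}.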
214
215bool AArch64TTIImpl::shouldMaximizeVectorBandwidth(
216 TargetTransformInfo::RegisterKind K) const {
217 assert(K != TargetTransformInfo::RGK_Scalar);
218 return (K == TargetTransformInfo::RGK_FixedWidthVector &&
219 ST->isNeonAvailable());
220}
221
222/// Calculate the cost of materializing a 64-bit value. This helper
223/// method might only calculate a fraction of a larger immediate. Therefore it
224/// is valid to return a cost of ZERO.
225InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
226 // Check if the immediate can be encoded within an instruction.
227 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
228 return 0;
229
230 if (Val < 0)
231 Val = ~Val;
232
233 // Calculate how many moves we will need to materialize this constant.
234 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
235 AArch64_IMM::expandMOVImm(Val, 64, Insn);
236 return Insn.size();
237}
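// Worked example (assuming the usual MOVZ/MOVK expansion): 0xFF00 is a valid
// 64-bit logical immediate, so it costs 0, while 0x123456789ABCDEF0 has four
// non-zero 16-bit chunks, expands to MOVZ plus three MOVKs, and costs 4.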
238
239/// Calculate the cost of materializing the given constant.
240InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
241 TTI::TargetCostKind CostKind) {
242 assert(Ty->isIntegerTy());
243
244 unsigned BitSize = Ty->getPrimitiveSizeInBits();
245 if (BitSize == 0)
246 return ~0U;
247
248 // Sign-extend all constants to a multiple of 64-bit.
249 APInt ImmVal = Imm;
250 if (BitSize & 0x3f)
251 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
252
253 // Split the constant into 64-bit chunks and calculate the cost for each
254 // chunk.
256 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
257 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
258 int64_t Val = Tmp.getSExtValue();
259 Cost += getIntImmCost(Val);
260 }
261 // We need at least one instruction to materialize the constant.
262 return std::max<InstructionCost>(1, Cost);
263}
264
266 const APInt &Imm, Type *Ty,
268 Instruction *Inst) {
269 assert(Ty->isIntegerTy());
270
271 unsigned BitSize = Ty->getPrimitiveSizeInBits();
272 // There is no cost model for constants with a bit size of 0. Return TCC_Free
273 // here, so that constant hoisting will ignore this constant.
274 if (BitSize == 0)
275 return TTI::TCC_Free;
276
277 unsigned ImmIdx = ~0U;
278 switch (Opcode) {
279 default:
280 return TTI::TCC_Free;
281 case Instruction::GetElementPtr:
282 // Always hoist the base address of a GetElementPtr.
283 if (Idx == 0)
284 return 2 * TTI::TCC_Basic;
285 return TTI::TCC_Free;
286 case Instruction::Store:
287 ImmIdx = 0;
288 break;
289 case Instruction::Add:
290 case Instruction::Sub:
291 case Instruction::Mul:
292 case Instruction::UDiv:
293 case Instruction::SDiv:
294 case Instruction::URem:
295 case Instruction::SRem:
296 case Instruction::And:
297 case Instruction::Or:
298 case Instruction::Xor:
299 case Instruction::ICmp:
300 ImmIdx = 1;
301 break;
302 // Always return TCC_Free for the shift value of a shift instruction.
303 case Instruction::Shl:
304 case Instruction::LShr:
305 case Instruction::AShr:
306 if (Idx == 1)
307 return TTI::TCC_Free;
308 break;
309 case Instruction::Trunc:
310 case Instruction::ZExt:
311 case Instruction::SExt:
312 case Instruction::IntToPtr:
313 case Instruction::PtrToInt:
314 case Instruction::BitCast:
315 case Instruction::PHI:
316 case Instruction::Call:
317 case Instruction::Select:
318 case Instruction::Ret:
319 case Instruction::Load:
320 break;
321 }
322
323 if (Idx == ImmIdx) {
324 int NumConstants = (BitSize + 63) / 64;
325 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
326 return (Cost <= NumConstants * TTI::TCC_Basic)
327 ? static_cast<int>(TTI::TCC_Free)
328 : Cost;
329 }
330 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
331}
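// For instance, an i64 add whose immediate expands to MOVZ plus three MOVKs
// has Cost == 4 > NumConstants * TCC_Basic, so the real cost is reported and
// constant hoisting may pull the immediate out of a loop; a single-MOVZ
// immediate is reported as TCC_Free and left in place.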
332
333InstructionCost
334AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
335 const APInt &Imm, Type *Ty,
336 TTI::TargetCostKind CostKind) {
337 assert(Ty->isIntegerTy());
338
339 unsigned BitSize = Ty->getPrimitiveSizeInBits();
340 // There is no cost model for constants with a bit size of 0. Return TCC_Free
341 // here, so that constant hoisting will ignore this constant.
342 if (BitSize == 0)
343 return TTI::TCC_Free;
344
345 // Most (all?) AArch64 intrinsics do not support folding immediates into the
346 // selected instruction, so we compute the materialization cost for the
347 // immediate directly.
348 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
349 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
350
351 switch (IID) {
352 default:
353 return TTI::TCC_Free;
354 case Intrinsic::sadd_with_overflow:
355 case Intrinsic::uadd_with_overflow:
356 case Intrinsic::ssub_with_overflow:
357 case Intrinsic::usub_with_overflow:
358 case Intrinsic::smul_with_overflow:
359 case Intrinsic::umul_with_overflow:
360 if (Idx == 1) {
361 int NumConstants = (BitSize + 63) / 64;
362 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
363 return (Cost <= NumConstants * TTI::TCC_Basic)
364 ? static_cast<int>(TTI::TCC_Free)
365 : Cost;
366 }
367 break;
368 case Intrinsic::experimental_stackmap:
369 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
370 return TTI::TCC_Free;
371 break;
372 case Intrinsic::experimental_patchpoint_void:
373 case Intrinsic::experimental_patchpoint_i64:
374 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
375 return TTI::TCC_Free;
376 break;
377 case Intrinsic::experimental_gc_statepoint:
378 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
379 return TTI::TCC_Free;
380 break;
381 }
382 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
383}
384
385TargetTransformInfo::PopcntSupportKind
386AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
387 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
388 if (TyWidth == 32 || TyWidth == 64)
389 return TTI::PSK_FastHardware;
390 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
391 return TTI::PSK_Software;
392}
393
394InstructionCost
395AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
396 TTI::TargetCostKind CostKind) {
397 auto *RetTy = ICA.getReturnType();
398 switch (ICA.getID()) {
399 case Intrinsic::umin:
400 case Intrinsic::umax:
401 case Intrinsic::smin:
402 case Intrinsic::smax: {
403 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
404 MVT::v8i16, MVT::v2i32, MVT::v4i32,
405 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
406 MVT::nxv2i64};
407 auto LT = getTypeLegalizationCost(RetTy);
408 // v2i64 types get converted to cmp+bif hence the cost of 2
409 if (LT.second == MVT::v2i64)
410 return LT.first * 2;
411 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
412 return LT.first;
413 break;
414 }
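// e.g. smin on <4 x i32> maps to a single SMIN (cost LT.first), while smin
// on <2 x i64> has no direct NEON instruction and is costed as cmp+bif (2).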
415 case Intrinsic::sadd_sat:
416 case Intrinsic::ssub_sat:
417 case Intrinsic::uadd_sat:
418 case Intrinsic::usub_sat: {
419 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
420 MVT::v8i16, MVT::v2i32, MVT::v4i32,
421 MVT::v2i64};
422 auto LT = getTypeLegalizationCost(RetTy);
423 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
424 // need to extend the type, as it uses shr(qadd(shl, shl)).
425 unsigned Instrs =
426 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
427 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
428 return LT.first * Instrs;
429 break;
430 }
431 case Intrinsic::abs: {
432 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
433 MVT::v8i16, MVT::v2i32, MVT::v4i32,
434 MVT::v2i64};
435 auto LT = getTypeLegalizationCost(RetTy);
436 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
437 return LT.first;
438 break;
439 }
440 case Intrinsic::bswap: {
441 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
442 MVT::v4i32, MVT::v2i64};
443 auto LT = getTypeLegalizationCost(RetTy);
444 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
445 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
446 return LT.first;
447 break;
448 }
449 case Intrinsic::experimental_stepvector: {
450 InstructionCost Cost = 1; // Cost of the `index' instruction
451 auto LT = getTypeLegalizationCost(RetTy);
452 // Legalisation of illegal vectors involves an `index' instruction plus
453 // (LT.first - 1) vector adds.
454 if (LT.first > 1) {
455 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
456 InstructionCost AddCost =
457 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
458 Cost += AddCost * (LT.first - 1);
459 }
460 return Cost;
461 }
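// e.g. a <vscale x 8 x i64> step vector legalizes into LT.first == 4 parts:
// one `index' instruction plus three vector adds.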
462 case Intrinsic::bitreverse: {
463 static const CostTblEntry BitreverseTbl[] = {
464 {Intrinsic::bitreverse, MVT::i32, 1},
465 {Intrinsic::bitreverse, MVT::i64, 1},
466 {Intrinsic::bitreverse, MVT::v8i8, 1},
467 {Intrinsic::bitreverse, MVT::v16i8, 1},
468 {Intrinsic::bitreverse, MVT::v4i16, 2},
469 {Intrinsic::bitreverse, MVT::v8i16, 2},
470 {Intrinsic::bitreverse, MVT::v2i32, 2},
471 {Intrinsic::bitreverse, MVT::v4i32, 2},
472 {Intrinsic::bitreverse, MVT::v1i64, 2},
473 {Intrinsic::bitreverse, MVT::v2i64, 2},
474 };
475 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
476 const auto *Entry =
477 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
478 if (Entry) {
479 // The cost model uses the legal type (i32) that i8 and i16 are promoted
480 // to, plus 1 so that we match the actual lowering cost.
481 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
482 TLI->getValueType(DL, RetTy, true) == MVT::i16)
483 return LegalisationCost.first * Entry->Cost + 1;
484
485 return LegalisationCost.first * Entry->Cost;
486 }
487 break;
488 }
489 case Intrinsic::ctpop: {
490 if (!ST->hasNEON()) {
491 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
492 return getTypeLegalizationCost(RetTy).first * 12;
493 }
494 static const CostTblEntry CtpopCostTbl[] = {
495 {ISD::CTPOP, MVT::v2i64, 4},
496 {ISD::CTPOP, MVT::v4i32, 3},
497 {ISD::CTPOP, MVT::v8i16, 2},
498 {ISD::CTPOP, MVT::v16i8, 1},
499 {ISD::CTPOP, MVT::i64, 4},
500 {ISD::CTPOP, MVT::v2i32, 3},
501 {ISD::CTPOP, MVT::v4i16, 2},
502 {ISD::CTPOP, MVT::v8i8, 1},
503 {ISD::CTPOP, MVT::i32, 5},
504 };
505 auto LT = getTypeLegalizationCost(RetTy);
506 MVT MTy = LT.second;
507 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
508 // Extra cost of +1 when illegal vector types are legalized by promoting
509 // the integer type.
510 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
511 RetTy->getScalarSizeInBits()
512 ? 1
513 : 0;
514 return LT.first * Entry->Cost + ExtraCost;
515 }
516 break;
517 }
518 case Intrinsic::sadd_with_overflow:
519 case Intrinsic::uadd_with_overflow:
520 case Intrinsic::ssub_with_overflow:
521 case Intrinsic::usub_with_overflow:
522 case Intrinsic::smul_with_overflow:
523 case Intrinsic::umul_with_overflow: {
524 static const CostTblEntry WithOverflowCostTbl[] = {
525 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
526 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
527 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
528 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
529 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
530 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
531 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
532 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
533 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
534 {Intrinsic::usub_with_overflow, MVT::i8, 3},
535 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
536 {Intrinsic::usub_with_overflow, MVT::i16, 3},
537 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
538 {Intrinsic::usub_with_overflow, MVT::i32, 1},
539 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
540 {Intrinsic::usub_with_overflow, MVT::i64, 1},
541 {Intrinsic::smul_with_overflow, MVT::i8, 5},
542 {Intrinsic::umul_with_overflow, MVT::i8, 4},
543 {Intrinsic::smul_with_overflow, MVT::i16, 5},
544 {Intrinsic::umul_with_overflow, MVT::i16, 4},
545 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
546 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
547 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
548 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
549 };
550 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
551 if (MTy.isSimple())
552 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
553 MTy.getSimpleVT()))
554 return Entry->Cost;
555 break;
556 }
557 case Intrinsic::fptosi_sat:
558 case Intrinsic::fptoui_sat: {
559 if (ICA.getArgTypes().empty())
560 break;
561 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
562 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
563 EVT MTy = TLI->getValueType(DL, RetTy);
564 // Check for the legal types, which are where the size of the input and the
565 // output are the same, or we are using cvt f64->i32 or f32->i64.
566 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
567 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
568 LT.second == MVT::v2f64) &&
569 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
570 (LT.second == MVT::f64 && MTy == MVT::i32) ||
571 (LT.second == MVT::f32 && MTy == MVT::i64)))
572 return LT.first;
573 // Similarly for fp16 sizes
574 if (ST->hasFullFP16() &&
575 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
576 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
577 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
578 return LT.first;
579
580 // Otherwise we use a legal convert followed by a min+max
581 if ((LT.second.getScalarType() == MVT::f32 ||
582 LT.second.getScalarType() == MVT::f64 ||
583 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
584 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
585 Type *LegalTy =
586 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
587 if (LT.second.isVector())
588 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
589 InstructionCost Cost = 1;
590 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
591 LegalTy, {LegalTy, LegalTy});
592 Cost += getIntrinsicInstrCost(Attrs1, CostKind);
593 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
594 LegalTy, {LegalTy, LegalTy});
595 Cost += getIntrinsicInstrCost(Attrs2, CostKind);
596 return LT.first * Cost;
597 }
598 break;
599 }
600 case Intrinsic::fshl:
601 case Intrinsic::fshr: {
602 if (ICA.getArgs().empty())
603 break;
604
605 // TODO: Add handling for fshl where third argument is not a constant.
606 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
607 if (!OpInfoZ.isConstant())
608 break;
609
610 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
611 if (OpInfoZ.isUniform()) {
612 // FIXME: The costs could be lower if the codegen is better.
613 static const CostTblEntry FshlTbl[] = {
614 {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
615 {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
616 {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
617 {Intrinsic::fshl, MVT::v8i8, 4}, {Intrinsic::fshl, MVT::v4i16, 4}};
618 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
619 // to avoid having to duplicate the costs.
620 const auto *Entry =
621 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
622 if (Entry)
623 return LegalisationCost.first * Entry->Cost;
624 }
625
626 auto TyL = getTypeLegalizationCost(RetTy);
627 if (!RetTy->isIntegerTy())
628 break;
629
630 // Estimate cost manually, as types like i8 and i16 will get promoted to
631 // i32 and CostTableLookup will ignore the extra conversion cost.
632 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
633 RetTy->getScalarSizeInBits() < 64) ||
634 (RetTy->getScalarSizeInBits() % 64 != 0);
635 unsigned ExtraCost = HigherCost ? 1 : 0;
636 if (RetTy->getScalarSizeInBits() == 32 ||
637 RetTy->getScalarSizeInBits() == 64)
638 ExtraCost = 0; // fshl/fshr for i32 and i64 can be lowered to a single
639 // extr instruction.
640 else if (HigherCost)
641 ExtraCost = 1;
642 else
643 break;
644 return TyL.first + ExtraCost;
645 }
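// e.g. fshl on i32/i64 with a constant shift lowers to a single EXTR (cost
// TyL.first), while an i16 funnel shift pays one extra instruction for the
// promotion to i32.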
646 default:
647 break;
648 }
649 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
650}
651
652/// Remove redundant reinterpret (to/from svbool) casts whose operand is a
653/// phi node, i.e. in the presence of control flow.
654static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
655 IntrinsicInst &II) {
656 SmallVector<Instruction *, 32> Worklist;
657 auto RequiredType = II.getType();
658
659 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
660 assert(PN && "Expected Phi Node!");
661
662 // Don't create a new Phi unless we can remove the old one.
663 if (!PN->hasOneUse())
664 return std::nullopt;
665
666 for (Value *IncValPhi : PN->incoming_values()) {
667 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
668 if (!Reinterpret ||
669 Reinterpret->getIntrinsicID() !=
670 Intrinsic::aarch64_sve_convert_to_svbool ||
671 RequiredType != Reinterpret->getArgOperand(0)->getType())
672 return std::nullopt;
673 }
674
675 // Create the new Phi
676 IC.Builder.SetInsertPoint(PN);
677 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
678 Worklist.push_back(PN);
679
680 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
681 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
682 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
683 Worklist.push_back(Reinterpret);
684 }
685
686 // Cleanup Phi Node and reinterprets
687 return IC.replaceInstUsesWith(II, NPN);
688}
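// Sketch of the rewrite performed above:
//   %p = phi <vscale x 16 x i1> [ to.svbool(%a), %bb0 ], [ to.svbool(%b), %bb1 ]
//   %r = convert.from.svbool(%p)            ; back to <vscale x 4 x i1>
// becomes
//   %r = phi <vscale x 4 x i1> [ %a, %bb0 ], [ %b, %bb1 ]
// with the now-dead converts queued on the worklist for removal.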
689
690// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
691// => (binop (pred) (from_svbool _) (from_svbool _))
692//
693// The above transformation eliminates a `to_svbool` in the predicate
694// operand of bitwise operation `binop` by narrowing the vector width of
695// the operation. For example, it would convert a `<vscale x 16 x i1>
696// and` into a `<vscale x 4 x i1> and`. This is profitable because
697// to_svbool must zero the new lanes during widening, whereas
698// from_svbool is free.
699static std::optional<Instruction *>
700tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
701 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
702 if (!BinOp)
703 return std::nullopt;
704
705 auto IntrinsicID = BinOp->getIntrinsicID();
706 switch (IntrinsicID) {
707 case Intrinsic::aarch64_sve_and_z:
708 case Intrinsic::aarch64_sve_bic_z:
709 case Intrinsic::aarch64_sve_eor_z:
710 case Intrinsic::aarch64_sve_nand_z:
711 case Intrinsic::aarch64_sve_nor_z:
712 case Intrinsic::aarch64_sve_orn_z:
713 case Intrinsic::aarch64_sve_orr_z:
714 break;
715 default:
716 return std::nullopt;
717 }
718
719 auto BinOpPred = BinOp->getOperand(0);
720 auto BinOpOp1 = BinOp->getOperand(1);
721 auto BinOpOp2 = BinOp->getOperand(2);
722
723 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
724 if (!PredIntr ||
725 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
726 return std::nullopt;
727
728 auto PredOp = PredIntr->getOperand(0);
729 auto PredOpTy = cast<VectorType>(PredOp->getType());
730 if (PredOpTy != II.getType())
731 return std::nullopt;
732
733 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
734 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
735 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
736 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
737 if (BinOpOp1 == BinOpOp2)
738 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
739 else
740 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
741 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
742
743 auto NarrowedBinOp =
744 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
745 return IC.replaceInstUsesWith(II, NarrowedBinOp);
746}
747
748static std::optional<Instruction *>
749instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {
750 // If the reinterpret instruction operand is a PHI Node
751 if (isa<PHINode>(II.getArgOperand(0)))
752 return processPhiNode(IC, II);
753
754 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
755 return BinOpCombine;
756
757 // Ignore converts to/from svcount_t.
758 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
759 isa<TargetExtType>(II.getType()))
760 return std::nullopt;
761
762 SmallVector<Instruction *, 32> CandidatesForRemoval;
763 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
764
765 const auto *IVTy = cast<VectorType>(II.getType());
766
767 // Walk the chain of conversions.
768 while (Cursor) {
769 // If the type of the cursor has fewer lanes than the final result, zeroing
770 // must take place, which breaks the equivalence chain.
771 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
772 if (CursorVTy->getElementCount().getKnownMinValue() <
773 IVTy->getElementCount().getKnownMinValue())
774 break;
775
776 // If the cursor has the same type as I, it is a viable replacement.
777 if (Cursor->getType() == IVTy)
778 EarliestReplacement = Cursor;
779
780 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
781
782 // If this is not an SVE conversion intrinsic, this is the end of the chain.
783 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
784 Intrinsic::aarch64_sve_convert_to_svbool ||
785 IntrinsicCursor->getIntrinsicID() ==
786 Intrinsic::aarch64_sve_convert_from_svbool))
787 break;
788
789 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
790 Cursor = IntrinsicCursor->getOperand(0);
791 }
792
793 // If no viable replacement in the conversion chain was found, there is
794 // nothing to do.
795 if (!EarliestReplacement)
796 return std::nullopt;
797
798 return IC.replaceInstUsesWith(II, EarliestReplacement);
799}
800
801static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
802 IntrinsicInst &II) {
803 auto Select = IC.Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
804 II.getOperand(2));
805 return IC.replaceInstUsesWith(II, Select);
806}
807
808static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
809 IntrinsicInst &II) {
810 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
811 if (!Pg)
812 return std::nullopt;
813
814 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
815 return std::nullopt;
816
817 const auto PTruePattern =
818 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
819 if (PTruePattern != AArch64SVEPredPattern::vl1)
820 return std::nullopt;
821
822 // The intrinsic is inserting into lane zero so use an insert instead.
823 auto *IdxTy = Type::getInt64Ty(II.getContext());
824 auto *Insert = InsertElementInst::Create(
825 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
826 Insert->insertBefore(&II);
827 Insert->takeName(&II);
828
829 return IC.replaceInstUsesWith(II, Insert);
830}
831
832static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
833 IntrinsicInst &II) {
834 // Replace DupX with a regular IR splat.
835 auto *RetTy = cast<ScalableVectorType>(II.getType());
836 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
837 II.getArgOperand(0));
838 Splat->takeName(&II);
839 return IC.replaceInstUsesWith(II, Splat);
840}
841
842static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
843 IntrinsicInst &II) {
844 LLVMContext &Ctx = II.getContext();
845
846 // Check that the predicate is all active
847 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
848 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
849 return std::nullopt;
850
851 const auto PTruePattern =
852 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
853 if (PTruePattern != AArch64SVEPredPattern::all)
854 return std::nullopt;
855
856 // Check that we have a compare of zero..
857 auto *SplatValue =
858 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
859 if (!SplatValue || !SplatValue->isZero())
860 return std::nullopt;
861
862 // ..against a dupq
863 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
864 if (!DupQLane ||
865 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
866 return std::nullopt;
867
868 // Where the dupq is a lane 0 replicate of a vector insert
869 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
870 return std::nullopt;
871
872 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
873 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
874 return std::nullopt;
875
876 // Where the vector insert is a fixed constant vector insert into undef at
877 // index zero
878 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
879 return std::nullopt;
880
881 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
882 return std::nullopt;
883
884 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
885 if (!ConstVec)
886 return std::nullopt;
887
888 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
889 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
890 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
891 return std::nullopt;
892
893 unsigned NumElts = VecTy->getNumElements();
894 unsigned PredicateBits = 0;
895
896 // Expand intrinsic operands to a 16-bit byte level predicate
897 for (unsigned I = 0; I < NumElts; ++I) {
898 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
899 if (!Arg)
900 return std::nullopt;
901 if (!Arg->isZero())
902 PredicateBits |= 1 << (I * (16 / NumElts));
903 }
904
905 // If all bits are zero bail early with an empty predicate
906 if (PredicateBits == 0) {
907 auto *PFalse = Constant::getNullValue(II.getType());
908 PFalse->takeName(&II);
909 return IC.replaceInstUsesWith(II, PFalse);
910 }
911
912 // Calculate largest predicate type used (where byte predicate is largest)
913 unsigned Mask = 8;
914 for (unsigned I = 0; I < 16; ++I)
915 if ((PredicateBits & (1 << I)) != 0)
916 Mask |= (I % 8);
917
918 unsigned PredSize = Mask & -Mask;
919 auto *PredType = ScalableVectorType::get(
920 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
921
922 // Ensure all relevant bits are set
923 for (unsigned I = 0; I < 16; I += PredSize)
924 if ((PredicateBits & (1 << I)) == 0)
925 return std::nullopt;
926
927 auto *PTruePat =
928 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
929 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
930 {PredType}, {PTruePat});
931 auto *ConvertToSVBool = IC.Builder.CreateIntrinsic(
932 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
933 auto *ConvertFromSVBool =
934 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
935 {II.getType()}, {ConvertToSVBool});
936
937 ConvertFromSVBool->takeName(&II);
938 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
939}
940
941static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
942 IntrinsicInst &II) {
943 Value *Pg = II.getArgOperand(0);
944 Value *Vec = II.getArgOperand(1);
945 auto IntrinsicID = II.getIntrinsicID();
946 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
947
948 // lastX(splat(X)) --> X
949 if (auto *SplatVal = getSplatValue(Vec))
950 return IC.replaceInstUsesWith(II, SplatVal);
951
952 // If x and/or y is a splat value then:
953 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
954 Value *LHS, *RHS;
955 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
956 if (isSplatValue(LHS) || isSplatValue(RHS)) {
957 auto *OldBinOp = cast<BinaryOperator>(Vec);
958 auto OpC = OldBinOp->getOpcode();
959 auto *NewLHS =
960 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
961 auto *NewRHS =
962 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
963 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
964 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
965 return IC.replaceInstUsesWith(II, NewBinOp);
966 }
967 }
968
969 auto *C = dyn_cast<Constant>(Pg);
970 if (IsAfter && C && C->isNullValue()) {
971 // The intrinsic is extracting lane 0 so use an extract instead.
972 auto *IdxTy = Type::getInt64Ty(II.getContext());
973 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
974 Extract->insertBefore(&II);
975 Extract->takeName(&II);
976 return IC.replaceInstUsesWith(II, Extract);
977 }
978
979 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
980 if (!IntrPG)
981 return std::nullopt;
982
983 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
984 return std::nullopt;
985
986 const auto PTruePattern =
987 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
988
989 // Can the intrinsic's predicate be converted to a known constant index?
990 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
991 if (!MinNumElts)
992 return std::nullopt;
993
994 unsigned Idx = MinNumElts - 1;
995 // Increment the index if extracting the element after the last active
996 // predicate element.
997 if (IsAfter)
998 ++Idx;
999
1000 // Ignore extracts whose index is larger than the known minimum vector
1001 // length. NOTE: This is an artificial constraint where we prefer to
1002 // maintain what the user asked for until an alternative is proven faster.
1003 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
1004 if (Idx >= PgVTy->getMinNumElements())
1005 return std::nullopt;
1006
1007 // The intrinsic is extracting a fixed lane so use an extract instead.
1008 auto *IdxTy = Type::getInt64Ty(II.getContext());
1009 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
1010 Extract->insertBefore(&II);
1011 Extract->takeName(&II);
1012 return IC.replaceInstUsesWith(II, Extract);
1013}
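// e.g. with Pg == ptrue(vl4), lastb(Pg, %v) becomes extractelement %v, i64 3
// and lasta(Pg, %v) becomes extractelement %v, i64 4, provided the
// predicate's known minimum lane count covers that index.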
1014
1015static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
1016 IntrinsicInst &II) {
1017 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
1018 // integer variant across a variety of micro-architectures. Replace scalar
1019 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
1020 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
1021 // depending on the micro-architecture, but has been observed as generally
1022 // being faster, particularly when the CLAST[AB] op is a loop-carried
1023 // dependency.
1024 Value *Pg = II.getArgOperand(0);
1025 Value *Fallback = II.getArgOperand(1);
1026 Value *Vec = II.getArgOperand(2);
1027 Type *Ty = II.getType();
1028
1029 if (!Ty->isIntegerTy())
1030 return std::nullopt;
1031
1032 Type *FPTy;
1033 switch (cast<IntegerType>(Ty)->getBitWidth()) {
1034 default:
1035 return std::nullopt;
1036 case 16:
1037 FPTy = IC.Builder.getHalfTy();
1038 break;
1039 case 32:
1040 FPTy = IC.Builder.getFloatTy();
1041 break;
1042 case 64:
1043 FPTy = IC.Builder.getDoubleTy();
1044 break;
1045 }
1046
1047 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
1048 auto *FPVTy = VectorType::get(
1049 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
1050 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
1051 auto *FPII = IC.Builder.CreateIntrinsic(
1052 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
1053 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
1054 return IC.replaceInstUsesWith(II, FPIItoInt);
1055}
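// e.g. an i32 clastb is rewritten to bitcast the fallback and vector to f32,
// run the same clastb on the FP types, and bitcast the result back, trading a
// couple of moves for the slower scalar-integer form of the instruction.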
1056
1057static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
1058 IntrinsicInst &II) {
1059 LLVMContext &Ctx = II.getContext();
1060 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
1061 // can work with RDFFR_PP for ptest elimination.
1062 auto *AllPat =
1063 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
1064 auto *PTrue = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
1065 {II.getType()}, {AllPat});
1066 auto *RDFFR =
1067 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
1068 RDFFR->takeName(&II);
1069 return IC.replaceInstUsesWith(II, RDFFR);
1070}
1071
1072static std::optional<Instruction *>
1073instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {
1074 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
1075
1076 if (Pattern == AArch64SVEPredPattern::all) {
1077 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
1078 auto *VScale = IC.Builder.CreateVScale(StepVal);
1079 VScale->takeName(&II);
1080 return IC.replaceInstUsesWith(II, VScale);
1081 }
1082
1083 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
1084
1085 return MinNumElts && NumElts >= MinNumElts
1086 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
1087 II, ConstantInt::get(II.getType(), MinNumElts)))
1088 : std::nullopt;
1089}
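// e.g. cntw(all) becomes vscale * 4, while cntw(vl2) folds to the constant 2
// because a scalable vector of 32-bit elements always has at least 4 lanes.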
1090
1091static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
1092 IntrinsicInst &II) {
1093 Value *PgVal = II.getArgOperand(0);
1094 Value *OpVal = II.getArgOperand(1);
1095
1096 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
1097 // Later optimizations prefer this form.
1098 if (PgVal == OpVal &&
1099 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
1100 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
1101 Value *Ops[] = {PgVal, OpVal};
1102 Type *Tys[] = {PgVal->getType()};
1103
1104 auto *PTest =
1105 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
1106 PTest->takeName(&II);
1107
1108 return IC.replaceInstUsesWith(II, PTest);
1109 }
1110
1111 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1112 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1113
1114 if (!Pg || !Op)
1115 return std::nullopt;
1116
1117 Intrinsic::ID OpIID = Op->getIntrinsicID();
1118
1119 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1120 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1121 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1122 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1123 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1124
1125 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1126
1127 PTest->takeName(&II);
1128 return IC.replaceInstUsesWith(II, PTest);
1129 }
1130
1131 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1132 // Later optimizations may rewrite sequence to use the flag-setting variant
1133 // of instruction X to remove PTEST.
1134 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1135 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1136 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1137 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1138 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1139 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1140 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1141 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1142 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1143 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1144 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1145 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1146 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1147 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1148 Type *Tys[] = {Pg->getType()};
1149
1150 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1151 PTest->takeName(&II);
1152
1153 return IC.replaceInstUsesWith(II, PTest);
1154 }
1155
1156 return std::nullopt;
1157}
1158
1159template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1160static std::optional<Instruction *>
1161instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
1162 bool MergeIntoAddendOp) {
1163 Value *P = II.getOperand(0);
1164 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1165 if (MergeIntoAddendOp) {
1166 AddendOp = II.getOperand(1);
1167 Mul = II.getOperand(2);
1168 } else {
1169 AddendOp = II.getOperand(2);
1170 Mul = II.getOperand(1);
1171 }
1172
1173 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1174 m_Value(MulOp1))))
1175 return std::nullopt;
1176
1177 if (!Mul->hasOneUse())
1178 return std::nullopt;
1179
1180 Instruction *FMFSource = nullptr;
1181 if (II.getType()->isFPOrFPVectorTy()) {
1182 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1183 // Stop the combine when the flags on the inputs differ in case dropping
1184 // flags would lead to us missing out on more beneficial optimizations.
1185 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1186 return std::nullopt;
1187 if (!FAddFlags.allowContract())
1188 return std::nullopt;
1189 FMFSource = &II;
1190 }
1191
1192 CallInst *Res;
1193 if (MergeIntoAddendOp)
1194 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1195 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1196 else
1197 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1198 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1199
1200 return IC.replaceInstUsesWith(II, Res);
1201}
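// e.g. fadd(pg, %acc, fmul(pg, %a, %b)) with matching, contract-enabled fast
// math flags and a single-use fmul is fused into fmla(pg, %acc, %a, %b).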
1202
1203static bool isAllActivePredicate(Value *Pred) {
1204 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1205 Value *UncastedPred;
1206 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1207 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1208 m_Value(UncastedPred)))))
1209 // If the predicate has the same or less lanes than the uncasted
1210 // predicate then we know the casting has no effect.
1211 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1212 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1213 Pred = UncastedPred;
1214
1215 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1216 m_ConstantInt<AArch64SVEPredPattern::all>()));
1217}
1218
1219static std::optional<Instruction *>
1220instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1221 Value *Pred = II.getOperand(0);
1222 Value *PtrOp = II.getOperand(1);
1223 Type *VecTy = II.getType();
1224
1225 if (isAllActivePredicate(Pred)) {
1226 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
1227 Load->copyMetadata(II);
1228 return IC.replaceInstUsesWith(II, Load);
1229 }
1230
1231 CallInst *MaskedLoad =
1232 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
1233 Pred, ConstantAggregateZero::get(VecTy));
1234 MaskedLoad->copyMetadata(II);
1235 return IC.replaceInstUsesWith(II, MaskedLoad);
1236}
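// i.e. ld1(ptrue all, %ptr) becomes a plain load, and any other predicate
// becomes an llvm.masked.load with a zeroinitializer passthru, which generic
// passes know how to optimize further.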
1237
1238static std::optional<Instruction *>
1239instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {
1240 Value *VecOp = II.getOperand(0);
1241 Value *Pred = II.getOperand(1);
1242 Value *PtrOp = II.getOperand(2);
1243
1244 if (isAllActivePredicate(Pred)) {
1245 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
1246 Store->copyMetadata(II);
1247 return IC.eraseInstFromFunction(II);
1248 }
1249
1250 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
1251 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
1252 MaskedStore->copyMetadata(II);
1253 return IC.eraseInstFromFunction(II);
1254}
1255
1256static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic) {
1257 switch (Intrinsic) {
1258 case Intrinsic::aarch64_sve_fmul_u:
1259 return Instruction::BinaryOps::FMul;
1260 case Intrinsic::aarch64_sve_fadd_u:
1261 return Instruction::BinaryOps::FAdd;
1262 case Intrinsic::aarch64_sve_fsub_u:
1263 return Instruction::BinaryOps::FSub;
1264 default:
1265 return Instruction::BinaryOpsEnd;
1266 }
1267}
1268
1269static std::optional<Instruction *>
1270instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {
1271 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
1272 if (II.isStrictFP())
1273 return std::nullopt;
1274
1275 auto *OpPredicate = II.getOperand(0);
1276 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1277 if (BinOpCode == Instruction::BinaryOpsEnd ||
1278 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1279 m_ConstantInt<AArch64SVEPredPattern::all>())))
1280 return std::nullopt;
1281 IRBuilderBase::FastMathFlagGuard FMFGuard(IC.Builder);
1282 IC.Builder.setFastMathFlags(II.getFastMathFlags());
1283 auto BinOp =
1284 IC.Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1285 return IC.replaceInstUsesWith(II, BinOp);
1286}
1287
1288// Canonicalise operations that take an all active predicate (e.g. sve.add ->
1289// sve.add_u).
1290static std::optional<Instruction *> instCombineSVEAllActive(IntrinsicInst &II,
1291 Intrinsic::ID IID) {
1292 auto *OpPredicate = II.getOperand(0);
1293 if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1294 m_ConstantInt<AArch64SVEPredPattern::all>())))
1295 return std::nullopt;
1296
1297 auto *Mod = II.getModule();
1298 auto *NewDecl = Intrinsic::getDeclaration(Mod, IID, {II.getType()});
1299 II.setCalledFunction(NewDecl);
1300
1301 return &II;
1302}
1303
1304static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1305 IntrinsicInst &II) {
1306 if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_add_u))
1307 return II_U;
1308 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1309 Intrinsic::aarch64_sve_mla>(
1310 IC, II, true))
1311 return MLA;
1312 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1313 Intrinsic::aarch64_sve_mad>(
1314 IC, II, false))
1315 return MAD;
1316 return std::nullopt;
1317}
1318
1319static std::optional<Instruction *>
1320instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {
1321 if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fadd_u))
1322 return II_U;
1323 if (auto FMLA =
1324 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1325 Intrinsic::aarch64_sve_fmla>(IC, II,
1326 true))
1327 return FMLA;
1328 if (auto FMAD =
1329 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1330 Intrinsic::aarch64_sve_fmad>(IC, II,
1331 false))
1332 return FMAD;
1333 if (auto FMLA =
1334 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1335 Intrinsic::aarch64_sve_fmla>(IC, II,
1336 true))
1337 return FMLA;
1338 return std::nullopt;
1339}
1340
1341static std::optional<Instruction *>
1342instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {
1343 if (auto FMLA =
1344 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1345 Intrinsic::aarch64_sve_fmla>(IC, II,
1346 true))
1347 return FMLA;
1348 if (auto FMAD =
1349 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1350 Intrinsic::aarch64_sve_fmad>(IC, II,
1351 false))
1352 return FMAD;
1353 if (auto FMLA_U =
1354 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1355 Intrinsic::aarch64_sve_fmla_u>(
1356 IC, II, true))
1357 return FMLA_U;
1358 return instCombineSVEVectorBinOp(IC, II);
1359}
1360
1361static std::optional<Instruction *>
1362instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {
1363 if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fsub_u))
1364 return II_U;
1365 if (auto FMLS =
1366 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1367 Intrinsic::aarch64_sve_fmls>(IC, II,
1368 true))
1369 return FMLS;
1370 if (auto FMSB =
1371 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1372 Intrinsic::aarch64_sve_fnmsb>(
1373 IC, II, false))
1374 return FMSB;
1375 if (auto FMLS =
1376 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1377 Intrinsic::aarch64_sve_fmls>(IC, II,
1378 true))
1379 return FMLS;
1380 return std::nullopt;
1381}
1382
1383static std::optional<Instruction *>
1384instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {
1385 if (auto FMLS =
1386 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1387 Intrinsic::aarch64_sve_fmls>(IC, II,
1388 true))
1389 return FMLS;
1390 if (auto FMSB =
1391 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1392 Intrinsic::aarch64_sve_fnmsb>(
1393 IC, II, false))
1394 return FMSB;
1395 if (auto FMLS_U =
1396 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
1397 Intrinsic::aarch64_sve_fmls_u>(
1398 IC, II, true))
1399 return FMLS_U;
1400 return instCombineSVEVectorBinOp(IC, II);
1401}
1402
1403static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1404 IntrinsicInst &II) {
1405 if (auto II_U = instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sub_u))
1406 return II_U;
1407 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1408 Intrinsic::aarch64_sve_mls>(
1409 IC, II, true))
1410 return MLS;
1411 return std::nullopt;
1412}
1413
1414static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1415 IntrinsicInst &II,
1416 Intrinsic::ID IID) {
1417 auto *OpPredicate = II.getOperand(0);
1418 auto *OpMultiplicand = II.getOperand(1);
1419 auto *OpMultiplier = II.getOperand(2);
1420
1421 // Canonicalise a non _u intrinsic only.
1422 if (II.getIntrinsicID() != IID)
1423 if (auto II_U = instCombineSVEAllActive(II, IID))
1424 return II_U;
1425
1426 // Return true if a given instruction is a unit splat value, false otherwise.
1427 auto IsUnitSplat = [](auto *I) {
1428 auto *SplatValue = getSplatValue(I);
1429 if (!SplatValue)
1430 return false;
1431 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1432 };
1433
1434 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1435 // with a unit splat value, false otherwise.
1436 auto IsUnitDup = [](auto *I) {
1437 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1438 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1439 return false;
1440
1441 auto *SplatValue = IntrI->getOperand(2);
1442 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1443 };
1444
1445 if (IsUnitSplat(OpMultiplier)) {
1446 // [f]mul pg %n, (dupx 1) => %n
1447 OpMultiplicand->takeName(&II);
1448 return IC.replaceInstUsesWith(II, OpMultiplicand);
1449 } else if (IsUnitDup(OpMultiplier)) {
1450 // [f]mul pg %n, (dup pg 1) => %n
1451 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1452 auto *DupPg = DupInst->getOperand(1);
1453 // TODO: this is naive. The optimization is still valid if DupPg
1454 // 'encompasses' OpPredicate, not only if they're the same predicate.
1455 if (OpPredicate == DupPg) {
1456 OpMultiplicand->takeName(&II);
1457 return IC.replaceInstUsesWith(II, OpMultiplicand);
1458 }
1459 }
1460
1461 return instCombineSVEVectorBinOp(IC, II);
1462}
1463
1464static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1465 IntrinsicInst &II) {
1466 Value *UnpackArg = II.getArgOperand(0);
1467 auto *RetTy = cast<ScalableVectorType>(II.getType());
1468 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1469 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1470
1471 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1472 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1473 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1474 ScalarArg =
1475 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1476 Value *NewVal =
1477 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1478 NewVal->takeName(&II);
1479 return IC.replaceInstUsesWith(II, NewVal);
1480 }
1481
1482 return std::nullopt;
1483}
1484static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1485 IntrinsicInst &II) {
1486 auto *OpVal = II.getOperand(0);
1487 auto *OpIndices = II.getOperand(1);
1488 VectorType *VTy = cast<VectorType>(II.getType());
1489
1490 // Check whether OpIndices is a constant splat value < minimal element count
1491 // of result.
1492 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1493 if (!SplatValue ||
1494 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1495 return std::nullopt;
1496
1497 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1498 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1499 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
1500 auto *VectorSplat =
1501 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1502
1503 VectorSplat->takeName(&II);
1504 return IC.replaceInstUsesWith(II, VectorSplat);
1505}
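// e.g. tbl(%v, dup.x(2)) becomes splat(extractelement %v, i64 2), provided
// the splatted index is known to be in range for the minimum vector length.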
1506
1507static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1508 IntrinsicInst &II) {
1509 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1510 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1511 Value *A, *B;
1512 if (match(II.getArgOperand(0),
1513 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1514 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1515 m_Specific(A), m_Specific(B))))
1516 return IC.replaceInstUsesWith(
1517 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1518
1519 return std::nullopt;
1520}
1521
1522static std::optional<Instruction *>
1523instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {
1524 Value *Mask = II.getOperand(0);
1525 Value *BasePtr = II.getOperand(1);
1526 Value *Index = II.getOperand(2);
1527 Type *Ty = II.getType();
1528 Value *PassThru = ConstantAggregateZero::get(Ty);
1529
1530 // Contiguous gather => masked load.
1531 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1532 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1533 Value *IndexBase;
1534 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1535 m_Value(IndexBase), m_SpecificInt(1)))) {
1536 Align Alignment =
1537 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1538
1539 Type *VecPtrTy = PointerType::getUnqual(Ty);
1540 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1541 BasePtr, IndexBase);
1542 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1543 CallInst *MaskedLoad =
1544 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1545 MaskedLoad->takeName(&II);
1546 return IC.replaceInstUsesWith(II, MaskedLoad);
1547 }
1548
1549 return std::nullopt;
1550}
1551
1552static std::optional<Instruction *>
1553instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {
1554 Value *Val = II.getOperand(0);
1555 Value *Mask = II.getOperand(1);
1556 Value *BasePtr = II.getOperand(2);
1557 Value *Index = II.getOperand(3);
1558 Type *Ty = Val->getType();
1559
1560 // Contiguous scatter => masked store.
1561 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1562 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1563 Value *IndexBase;
1564 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1565 m_Value(IndexBase), m_SpecificInt(1)))) {
1566 Align Alignment =
1567 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1568
1569 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1570 BasePtr, IndexBase);
1571 Type *VecPtrTy = PointerType::getUnqual(Ty);
1572 Ptr = IC.Builder.CreateBitCast(Ptr, VecPtrTy);
1573
1574 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1575
1576 return IC.eraseInstFromFunction(II);
1577 }
1578
1579 return std::nullopt;
1580}
1581
1582static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1583 IntrinsicInst &II) {
1584 Type *Int32Ty = IC.Builder.getInt32Ty();
1585 Value *Pred = II.getOperand(0);
1586 Value *Vec = II.getOperand(1);
1587 Value *DivVec = II.getOperand(2);
1588
1589 Value *SplatValue = getSplatValue(DivVec);
1590 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1591 if (!SplatConstantInt)
1592 return std::nullopt;
1593 APInt Divisor = SplatConstantInt->getValue();
1594
1595 if (Divisor.isPowerOf2()) {
1596 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1597 auto ASRD = IC.Builder.CreateIntrinsic(
1598 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1599 return IC.replaceInstUsesWith(II, ASRD);
1600 }
1601 if (Divisor.isNegatedPowerOf2()) {
1602 Divisor.negate();
1603 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1604 auto ASRD = IC.Builder.CreateIntrinsic(
1605 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1606 auto NEG = IC.Builder.CreateIntrinsic(
1607 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
1608 return IC.replaceInstUsesWith(II, NEG);
1609 }
1610
1611 return std::nullopt;
1612}
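// Worked example: sdiv(pg, %v, splat(8)) becomes asrd(pg, %v, 3); for a
// splat of -8 the same asrd is emitted followed by a predicated neg.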
1613
1614bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1615 size_t VecSize = Vec.size();
1616 if (VecSize == 1)
1617 return true;
1618 if (!isPowerOf2_64(VecSize))
1619 return false;
1620 size_t HalfVecSize = VecSize / 2;
1621
1622 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1623 RHS != Vec.end(); LHS++, RHS++) {
1624 if (*LHS != nullptr && *RHS != nullptr) {
1625 if (*LHS == *RHS)
1626 continue;
1627 else
1628 return false;
1629 }
1630 if (!AllowPoison)
1631 return false;
1632 if (*LHS == nullptr && *RHS != nullptr)
1633 *LHS = *RHS;
1634 }
1635
1636 Vec.resize(HalfVecSize);
1637 SimplifyValuePattern(Vec, AllowPoison);
1638 return true;
1639}
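// Editorial worked example (not part of the upstream source): for
// Vec = (A, B, A, B) the two halves (A, B) and (A, B) match element-wise,
// so Vec is shrunk to (A, B); the recursive call then compares A with B,
// fails, and the repeating pattern (A, B) is kept. A nullptr entry stands
// for a poison lane and, when AllowPoison is set, is filled in from the
// matching lane of the other half.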
1640
1641// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1642// to dupqlane(f64(C)) where C is A concatenated with B
1643static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1644 IntrinsicInst &II) {
1645 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1646 if (!match(II.getOperand(0),
1647 m_Intrinsic<Intrinsic::vector_insert>(
1648 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1649 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1650 return std::nullopt;
1651 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1652
1653 // Insert the scalars into a container ordered by InsertElement index
1654 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1655 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1656 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1657 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1658 CurrentInsertElt = InsertElt->getOperand(0);
1659 }
1660
1661 bool AllowPoison =
1662 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1663 if (!SimplifyValuePattern(Elts, AllowPoison))
1664 return std::nullopt;
1665
1666 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1667 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1668 for (size_t I = 0; I < Elts.size(); I++) {
1669 if (Elts[I] == nullptr)
1670 continue;
1671 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
1672 IC.Builder.getInt64(I));
1673 }
1674 if (InsertEltChain == nullptr)
1675 return std::nullopt;
1676
1677 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1678 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1679 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1680 // be narrowed back to the original type.
1681 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1682 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1683 IIScalableTy->getMinNumElements() /
1684 PatternWidth;
1685
1686 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
1687 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1688 auto *WideShuffleMaskTy =
1689 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
1690
1691 auto ZeroIdx = ConstantInt::get(IC.Builder.getInt64Ty(), APInt(64, 0));
1692 auto InsertSubvector = IC.Builder.CreateInsertVector(
1693 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1694 auto WideBitcast =
1695 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1696 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1697 auto WideShuffle = IC.Builder.CreateShuffleVector(
1698 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1699 auto NarrowBitcast =
1700 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1701
1702 return IC.replaceInstUsesWith(II, NarrowBitcast);
1703}
1704
1705static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1706 IntrinsicInst &II) {
1707 Value *A = II.getArgOperand(0);
1708 Value *B = II.getArgOperand(1);
1709 if (A == B)
1710 return IC.replaceInstUsesWith(II, A);
1711
1712 return std::nullopt;
1713}
1714
1715static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1716 IntrinsicInst &II) {
1717 Value *Pred = II.getOperand(0);
1718 Value *Vec = II.getOperand(1);
1719 Value *Shift = II.getOperand(2);
1720
1721 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1722 Value *AbsPred, *MergedValue;
1723 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1724 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1725 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1726 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1727
1728 return std::nullopt;
1729
1730 // Transform is valid if any of the following are true:
1731 // * The ABS merge value is an undef or non-negative
1732 // * The ABS predicate is all active
1733 // * The ABS predicate and the SRSHL predicates are the same
1734 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1735 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1736 return std::nullopt;
1737
1738 // Only valid when the shift amount is non-negative, otherwise the rounding
1739 // behaviour of SRSHL cannot be ignored.
1740 if (!match(Shift, m_NonNegative()))
1741 return std::nullopt;
1742
1743 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
1744 {II.getType()}, {Pred, Vec, Shift});
1745
1746 return IC.replaceInstUsesWith(II, LSL);
1747}
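// Editorial illustration (not part of the upstream source): SRSHL only
// differs from LSL when the shift amount is negative (it then performs a
// rounding right shift), so with a known non-negative operand the rewrite
// is, e.g.,
//   srshl(pg, abs(pg, x), splat(2))  ->  lsl(pg, abs(pg, x), splat(2))
// provided the splat shift amount is itself non-negative.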
1748
1749std::optional<Instruction *>
1750 AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1751 IntrinsicInst &II) const {
1752 Intrinsic::ID IID = II.getIntrinsicID();
1753 switch (IID) {
1754 default:
1755 break;
1756 case Intrinsic::aarch64_neon_fmaxnm:
1757 case Intrinsic::aarch64_neon_fminnm:
1758 return instCombineMaxMinNM(IC, II);
1759 case Intrinsic::aarch64_sve_convert_from_svbool:
1760 return instCombineConvertFromSVBool(IC, II);
1761 case Intrinsic::aarch64_sve_dup:
1762 return instCombineSVEDup(IC, II);
1763 case Intrinsic::aarch64_sve_dup_x:
1764 return instCombineSVEDupX(IC, II);
1765 case Intrinsic::aarch64_sve_cmpne:
1766 case Intrinsic::aarch64_sve_cmpne_wide:
1767 return instCombineSVECmpNE(IC, II);
1768 case Intrinsic::aarch64_sve_rdffr:
1769 return instCombineRDFFR(IC, II);
1770 case Intrinsic::aarch64_sve_lasta:
1771 case Intrinsic::aarch64_sve_lastb:
1772 return instCombineSVELast(IC, II);
1773 case Intrinsic::aarch64_sve_clasta_n:
1774 case Intrinsic::aarch64_sve_clastb_n:
1775 return instCombineSVECondLast(IC, II);
1776 case Intrinsic::aarch64_sve_cntd:
1777 return instCombineSVECntElts(IC, II, 2);
1778 case Intrinsic::aarch64_sve_cntw:
1779 return instCombineSVECntElts(IC, II, 4);
1780 case Intrinsic::aarch64_sve_cnth:
1781 return instCombineSVECntElts(IC, II, 8);
1782 case Intrinsic::aarch64_sve_cntb:
1783 return instCombineSVECntElts(IC, II, 16);
1784 case Intrinsic::aarch64_sve_ptest_any:
1785 case Intrinsic::aarch64_sve_ptest_first:
1786 case Intrinsic::aarch64_sve_ptest_last:
1787 return instCombineSVEPTest(IC, II);
1788 case Intrinsic::aarch64_sve_fabd:
1789 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fabd_u);
1790 case Intrinsic::aarch64_sve_fadd:
1791 return instCombineSVEVectorFAdd(IC, II);
1792 case Intrinsic::aarch64_sve_fadd_u:
1793 return instCombineSVEVectorFAddU(IC, II);
1794 case Intrinsic::aarch64_sve_fdiv:
1795 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fdiv_u);
1796 case Intrinsic::aarch64_sve_fmax:
1797 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmax_u);
1798 case Intrinsic::aarch64_sve_fmaxnm:
1799 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmaxnm_u);
1800 case Intrinsic::aarch64_sve_fmin:
1801 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmin_u);
1802 case Intrinsic::aarch64_sve_fminnm:
1803 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fminnm_u);
1804 case Intrinsic::aarch64_sve_fmla:
1805 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmla_u);
1806 case Intrinsic::aarch64_sve_fmls:
1807 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmls_u);
1808 case Intrinsic::aarch64_sve_fmul:
1809 case Intrinsic::aarch64_sve_fmul_u:
1810 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_fmul_u);
1811 case Intrinsic::aarch64_sve_fmulx:
1812 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fmulx_u);
1813 case Intrinsic::aarch64_sve_fnmla:
1814 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmla_u);
1815 case Intrinsic::aarch64_sve_fnmls:
1816 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_fnmls_u);
1817 case Intrinsic::aarch64_sve_fsub:
1818 return instCombineSVEVectorFSub(IC, II);
1819 case Intrinsic::aarch64_sve_fsub_u:
1820 return instCombineSVEVectorFSubU(IC, II);
1821 case Intrinsic::aarch64_sve_add:
1822 return instCombineSVEVectorAdd(IC, II);
1823 case Intrinsic::aarch64_sve_add_u:
1824 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
1825 Intrinsic::aarch64_sve_mla_u>(
1826 IC, II, true);
1827 case Intrinsic::aarch64_sve_mla:
1828 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mla_u);
1829 case Intrinsic::aarch64_sve_mls:
1830 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_mls_u);
1831 case Intrinsic::aarch64_sve_mul:
1832 case Intrinsic::aarch64_sve_mul_u:
1833 return instCombineSVEVectorMul(IC, II, Intrinsic::aarch64_sve_mul_u);
1834 case Intrinsic::aarch64_sve_sabd:
1835 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sabd_u);
1836 case Intrinsic::aarch64_sve_smax:
1837 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smax_u);
1838 case Intrinsic::aarch64_sve_smin:
1839 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smin_u);
1840 case Intrinsic::aarch64_sve_smulh:
1841 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_smulh_u);
1842 case Intrinsic::aarch64_sve_sub:
1843 return instCombineSVEVectorSub(IC, II);
1844 case Intrinsic::aarch64_sve_sub_u:
1845 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
1846 Intrinsic::aarch64_sve_mls_u>(
1847 IC, II, true);
1848 case Intrinsic::aarch64_sve_uabd:
1849 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uabd_u);
1850 case Intrinsic::aarch64_sve_umax:
1851 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umax_u);
1852 case Intrinsic::aarch64_sve_umin:
1853 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umin_u);
1854 case Intrinsic::aarch64_sve_umulh:
1855 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_umulh_u);
1856 case Intrinsic::aarch64_sve_asr:
1857 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_asr_u);
1858 case Intrinsic::aarch64_sve_lsl:
1859 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsl_u);
1860 case Intrinsic::aarch64_sve_lsr:
1861 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_lsr_u);
1862 case Intrinsic::aarch64_sve_and:
1863 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_and_u);
1864 case Intrinsic::aarch64_sve_bic:
1865 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_bic_u);
1866 case Intrinsic::aarch64_sve_eor:
1867 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_eor_u);
1868 case Intrinsic::aarch64_sve_orr:
1869 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_orr_u);
1870 case Intrinsic::aarch64_sve_sqsub:
1871 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_sqsub_u);
1872 case Intrinsic::aarch64_sve_uqsub:
1873 return instCombineSVEAllActive(II, Intrinsic::aarch64_sve_uqsub_u);
1874 case Intrinsic::aarch64_sve_tbl:
1875 return instCombineSVETBL(IC, II);
1876 case Intrinsic::aarch64_sve_uunpkhi:
1877 case Intrinsic::aarch64_sve_uunpklo:
1878 case Intrinsic::aarch64_sve_sunpkhi:
1879 case Intrinsic::aarch64_sve_sunpklo:
1880 return instCombineSVEUnpack(IC, II);
1881 case Intrinsic::aarch64_sve_zip1:
1882 case Intrinsic::aarch64_sve_zip2:
1883 return instCombineSVEZip(IC, II);
1884 case Intrinsic::aarch64_sve_ld1_gather_index:
1885 return instCombineLD1GatherIndex(IC, II);
1886 case Intrinsic::aarch64_sve_st1_scatter_index:
1887 return instCombineST1ScatterIndex(IC, II);
1888 case Intrinsic::aarch64_sve_ld1:
1889 return instCombineSVELD1(IC, II, DL);
1890 case Intrinsic::aarch64_sve_st1:
1891 return instCombineSVEST1(IC, II, DL);
1892 case Intrinsic::aarch64_sve_sdiv:
1893 return instCombineSVESDIV(IC, II);
1894 case Intrinsic::aarch64_sve_sel:
1895 return instCombineSVESel(IC, II);
1896 case Intrinsic::aarch64_sve_srshl:
1897 return instCombineSVESrshl(IC, II);
1898 case Intrinsic::aarch64_sve_dupq_lane:
1899 return instCombineSVEDupqLane(IC, II);
1900 }
1901
1902 return std::nullopt;
1903}
1904
1905 std::optional<Value *> AArch64TTIImpl::simplifyDemandedVectorEltsIntrinsic(
1906 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1907 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1908 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1909 SimplifyAndSetOp) const {
1910 switch (II.getIntrinsicID()) {
1911 default:
1912 break;
1913 case Intrinsic::aarch64_neon_fcvtxn:
1914 case Intrinsic::aarch64_neon_rshrn:
1915 case Intrinsic::aarch64_neon_sqrshrn:
1916 case Intrinsic::aarch64_neon_sqrshrun:
1917 case Intrinsic::aarch64_neon_sqshrn:
1918 case Intrinsic::aarch64_neon_sqshrun:
1919 case Intrinsic::aarch64_neon_sqxtn:
1920 case Intrinsic::aarch64_neon_sqxtun:
1921 case Intrinsic::aarch64_neon_uqrshrn:
1922 case Intrinsic::aarch64_neon_uqshrn:
1923 case Intrinsic::aarch64_neon_uqxtn:
1924 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1925 break;
1926 }
1927
1928 return std::nullopt;
1929}
1930
1931 TypeSize
1932 AArch64TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
1933 switch (K) {
1934 case TargetTransformInfo::RGK_Scalar:
1935 return TypeSize::getFixed(64);
1936 case TargetTransformInfo::RGK_FixedWidthVector:
1938 return TypeSize::getFixed(0);
1939
1940 if (ST->hasSVE())
1941 return TypeSize::getFixed(
1942 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
1943
1944 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
1945 case TargetTransformInfo::RGK_ScalableVector:
1947 return TypeSize::getScalable(0);
1948
1949 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
1950 }
1951 llvm_unreachable("Unsupported register kind");
1952}
1953
1954bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1955 ArrayRef<const Value *> Args,
1956 Type *SrcOverrideTy) {
1957 // A helper that returns a vector type from the given type. The number of
1958 // elements in type Ty determines the vector width.
1959 auto toVectorTy = [&](Type *ArgTy) {
1960 return VectorType::get(ArgTy->getScalarType(),
1961 cast<VectorType>(DstTy)->getElementCount());
1962 };
1963
1964 // Exit early if DstTy is not a vector type whose elements are one of [i16,
1965 // i32, i64]. SVE doesn't generally have the same set of instructions to
1966 // perform an extend with the add/sub/mul. There are SMULLB style
1967 // instructions, but they operate on top/bottom, requiring some sort of lane
1968 // interleaving to be used with zext/sext.
1969 unsigned DstEltSize = DstTy->getScalarSizeInBits();
1970 if (!useNeonVector(DstTy) || Args.size() != 2 ||
1971 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
1972 return false;
1973
1974 // Determine if the operation has a widening variant. We consider both the
1975 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1976 // instructions.
1977 //
1978 // TODO: Add additional widening operations (e.g., shl, etc.) once we
1979 // verify that their extending operands are eliminated during code
1980 // generation.
1981 Type *SrcTy = SrcOverrideTy;
1982 switch (Opcode) {
1983 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1984 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1985 // The second operand needs to be an extend
1986 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
1987 if (!SrcTy)
1988 SrcTy =
1989 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
1990 } else
1991 return false;
1992 break;
1993 case Instruction::Mul: { // SMULL(2), UMULL(2)
1994 // Both operands need to be extends of the same type.
1995 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
1996 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
1997 if (!SrcTy)
1998 SrcTy =
1999 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
2000 } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {
2001 // If one of the operands is a Zext and the other has enough zero bits to
2002 // be treated as unsigned, we can still generate a umull, meaning the zext
2003 // is free.
2004 KnownBits Known =
2005 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
2006 if (Args[0]->getType()->getScalarSizeInBits() -
2007 Known.Zero.countLeadingOnes() >
2008 DstTy->getScalarSizeInBits() / 2)
2009 return false;
2010 if (!SrcTy)
2011 SrcTy = toVectorTy(Type::getIntNTy(DstTy->getContext(),
2012 DstTy->getScalarSizeInBits() / 2));
2013 } else
2014 return false;
2015 break;
2016 }
2017 default:
2018 return false;
2019 }
2020
2021 // Legalize the destination type and ensure it can be used in a widening
2022 // operation.
2023 auto DstTyL = getTypeLegalizationCost(DstTy);
2024 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
2025 return false;
2026
2027 // Legalize the source type and ensure it can be used in a widening
2028 // operation.
2029 assert(SrcTy && "Expected some SrcTy");
2030 auto SrcTyL = getTypeLegalizationCost(SrcTy);
2031 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
2032 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
2033 return false;
2034
2035 // Get the total number of vector elements in the legalized types.
2036 InstructionCost NumDstEls =
2037 DstTyL.first * DstTyL.second.getVectorMinNumElements();
2038 InstructionCost NumSrcEls =
2039 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
2040
2041 // Return true if the legalized types have the same number of vector elements
2042 // and the destination element type size is twice that of the source type.
2043 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
2044}
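// Editorial illustration (not part of the upstream source): a widening add
// such as
//   %a = zext <8 x i8> %x to <8 x i16>
//   %b = zext <8 x i8> %y to <8 x i16>
//   %s = add <8 x i16> %a, %b
// legalizes to a single uaddl, so isWideningInstruction returns true and
// getCastInstrCost below can treat the feeding extends as free.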
2045
2046// s/urhadd instructions implement the following pattern, making the
2047// extends free:
2048// %x = add ((zext i8 -> i16), 1)
2049// %y = (zext i8 -> i16)
2050// trunc i16 (lshr (add %x, %y), 1) -> i8
2051//
2052 bool AArch64TTIImpl::isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst,
2053 Type *Src) {
2054 // The source should be a legal vector type.
2055 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
2056 (Src->isScalableTy() && !ST->hasSVE2()))
2057 return false;
2058
2059 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
2060 return false;
2061
2062 // Look for trunc/shl/add before trying to match the pattern.
2063 const Instruction *Add = ExtUser;
2064 auto *AddUser =
2065 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2066 if (AddUser && AddUser->getOpcode() == Instruction::Add)
2067 Add = AddUser;
2068
2069 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
2070 if (!Shr || Shr->getOpcode() != Instruction::LShr)
2071 return false;
2072
2073 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
2074 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
2075 Src->getScalarSizeInBits() !=
2076 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
2077 return false;
2078
2079 // Try to match the whole pattern. Ext could be either the first or second
2080 // m_ZExtOrSExt matched.
2081 Instruction *Ex1, *Ex2;
2082 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
2083 m_c_Add(m_Instruction(Ex2), m_SpecificInt(1))))))
2084 return false;
2085
2086 // Ensure both extends are of the same type
2087 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
2088 Ex1->getOpcode() == Ex2->getOpcode())
2089 return true;
2090
2091 return false;
2092}
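// Editorial IR sketch (not part of the upstream source; names are
// assumptions) of the averaging shape matched above, which selects to
// urhadd and makes both extends free:
//   %xe = zext <8 x i8> %a to <8 x i16>
//   %x  = add <8 x i16> %xe, <i16 1, ...>
//   %ye = zext <8 x i8> %b to <8 x i16>
//   %s  = add <8 x i16> %x, %ye
//   %sh = lshr <8 x i16> %s, <i16 1, ...>
//   %r  = trunc <8 x i16> %sh to <8 x i8>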
2093
2094 InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
2095 Type *Src,
2096 TTI::CastContextHint CCH,
2097 TTI::TargetCostKind CostKind,
2098 const Instruction *I) {
2099 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2100 assert(ISD && "Invalid opcode");
2101 // If the cast is observable, and it is used by a widening instruction (e.g.,
2102 // uaddl, saddw, etc.), it may be free.
2103 if (I && I->hasOneUser()) {
2104 auto *SingleUser = cast<Instruction>(*I->user_begin());
2105 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
2106 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
2107 // For adds only count the second operand as free if both operands are
2108 // extends but not the same operation. (i.e. both operands are not free in
2109 // add(sext, zext)).
2110 if (SingleUser->getOpcode() == Instruction::Add) {
2111 if (I == SingleUser->getOperand(1) ||
2112 (isa<CastInst>(SingleUser->getOperand(1)) &&
2113 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
2114 return 0;
2115 } else // Others are free so long as isWideningInstruction returned true.
2116 return 0;
2117 }
2118
2119 // The cast will be free for the s/urhadd instructions
2120 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
2121 isExtPartOfAvgExpr(SingleUser, Dst, Src))
2122 return 0;
2123 }
2124
2125 // TODO: Allow non-throughput costs that aren't binary.
2126 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
2127 if (CostKind != TTI::TCK_RecipThroughput)
2128 return Cost == 0 ? 0 : 1;
2129 return Cost;
2130 };
2131
2132 EVT SrcTy = TLI->getValueType(DL, Src);
2133 EVT DstTy = TLI->getValueType(DL, Dst);
2134
2135 if (!SrcTy.isSimple() || !DstTy.isSimple())
2136 return AdjustCost(
2137 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2138
2139 static const TypeConversionCostTblEntry
2140 ConversionTbl[] = {
2141 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
2142 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
2143 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
2144 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
2145 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
2146 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
2147 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
2148 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
2149 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
2150 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
2151 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
2152 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
2153 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
2154 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
2155 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
2156 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
2157 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
2158 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
2159 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
2160 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
2161
2162 // Truncations on nxvmiN
2163 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 1 },
2164 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 1 },
2165 { ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 1 },
2166 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 1 },
2167 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 1 },
2168 { ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 2 },
2169 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 1 },
2170 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 3 },
2171 { ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 5 },
2172 { ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 1 },
2173 { ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 1 },
2174 { ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 1 },
2175 { ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 1 },
2176 { ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 2 },
2177 { ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 3 },
2178 { ISD::TRUNCATE, MVT::nxv8i32, MVT::nxv8i64, 6 },
2179
2180 // The number of shll instructions for the extension.
2181 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2182 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
2183 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2184 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2 },
2185 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2186 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
2187 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2188 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2 },
2189 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2190 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
2191 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2192 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
2193 { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2194 { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2 },
2195 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2196 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
2197
2198 // LowerVectorINT_TO_FP:
2199 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2200 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2201 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2202 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
2203 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
2204 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
2205
2206 // Complex: to v2f32
2207 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2208 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2209 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2210 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
2211 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3 },
2212 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 2 },
2213
2214 // Complex: to v4f32
2215 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4 },
2216 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2217 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
2218 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
2219
2220 // Complex: to v8f32
2221 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2222 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2223 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10 },
2224 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
2225
2226 // Complex: to v16f32
2227 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2228 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21 },
2229
2230 // Complex: to v2f64
2231 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2232 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2233 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2234 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
2235 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4 },
2236 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
2237
2238 // Complex: to v4f64
2239 { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2240 { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4 },
2241
2242 // LowerVectorFP_TO_INT
2243 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1 },
2244 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
2245 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
2246 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
2247 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
2248 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
2249
2250 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
2251 { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2 },
2252 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1 },
2253 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1 },
2254 { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2 },
2255 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1 },
2256 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1 },
2257
2258 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
2259 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
2260 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2 },
2261 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
2262 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2 },
2263
2264 // Complex, from nxv2f32.
2265 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2266 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2267 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2268 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2269 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1 },
2270 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1 },
2271 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1 },
2272 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1 },
2273
2274 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
2275 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
2276 { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
2277 { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2 },
2278 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
2279 { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
2280 { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2 },
2281
2282 // Complex, from nxv2f64.
2283 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2284 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2285 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2286 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2287 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1 },
2288 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1 },
2289 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1 },
2290 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1 },
2291
2292 // Complex, from nxv4f32.
2293 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2294 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2295 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2296 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2297 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4 },
2298 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1 },
2299 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1 },
2300 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1 },
2301
2302 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
2303 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2304 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2305 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7 },
2306 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7 },
2307
2308 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
2309 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2310 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2311 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2312 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3 },
2313 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3 },
2314 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3 },
2315
2316 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2317 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2318 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2319 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3 },
2320 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3 },
2321
2322 // Complex, from nxv8f16.
2323 { ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2324 { ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2325 { ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2326 { ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2327 { ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10 },
2328 { ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4 },
2329 { ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1 },
2330 { ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1 },
2331
2332 // Complex, from nxv4f16.
2333 { ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2334 { ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2335 { ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2336 { ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2337 { ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4 },
2338 { ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1 },
2339 { ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1 },
2340 { ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1 },
2341
2342 // Complex, from nxv2f16.
2343 { ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2344 { ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2345 { ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2346 { ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2347 { ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1 },
2348 { ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1 },
2349 { ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1 },
2350 { ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1 },
2351
2352 // Truncate from nxvmf32 to nxvmf16.
2353 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1 },
2354 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1 },
2355 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3 },
2356
2357 // Truncate from nxvmf64 to nxvmf16.
2358 { ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1 },
2359 { ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3 },
2360 { ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7 },
2361
2362 // Truncate from nxvmf64 to nxvmf32.
2363 { ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1 },
2364 { ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3 },
2365 { ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6 },
2366
2367 // Extend from nxvmf16 to nxvmf32.
2368 { ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
2369 { ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
2370 { ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
2371
2372 // Extend from nxvmf16 to nxvmf64.
2373 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
2374 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
2375 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
2376
2377 // Extend from nxvmf32 to nxvmf64.
2378 { ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
2379 { ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
2380 { ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
2381
2382 // Bitcasts from float to integer
2383 { ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0 },
2384 { ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0 },
2385 { ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0 },
2386
2387 // Bitcasts from integer to float
2388 { ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0 },
2389 { ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0 },
2390 { ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0 },
2391
2392 // Add cost for extending to illegal -too wide- scalable vectors.
2393 // zero/sign extend are implemented by multiple unpack operations,
2394 // where each operation has a cost of 1.
2395 { ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2396 { ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2397 { ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2398 { ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2399 { ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2400 { ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2401
2402 { ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
2403 { ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
2404 { ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
2405 { ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
2406 { ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
2407 { ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
2408 };
2409
2410 // Estimate the cost of a fixed-length operation performed on SVE registers
2411 // by the number of SVE registers required to represent the fixed-length
2412 // type.
2413 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
2414 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
2415 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
2416 ST->useSVEForFixedLengthVectors(WiderTy)) {
2417 std::pair<InstructionCost, MVT> LT =
2418 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
2419 unsigned NumElements = AArch64::SVEBitsPerBlock /
2420 LT.second.getVectorElementType().getSizeInBits();
2421 return AdjustCost(
2422 LT.first *
2423 getCastInstrCost(
2424 Opcode, ScalableVectorType::get(Dst->getScalarType(), NumElements),
2425 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
2426 CostKind, I));
2427 }
2428
2429 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2430 DstTy.getSimpleVT(),
2431 SrcTy.getSimpleVT()))
2432 return AdjustCost(Entry->Cost);
2433
2434 static const TypeConversionCostTblEntry FP16Tbl[] = {
2435 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2436 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
2437 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2438 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
2439 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2440 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
2441 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2442 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
2443 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2444 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
2445 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2446 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
2447 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2448 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
2449 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2450 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
2451 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2452 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
2453 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2454 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2455 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2456 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2457 };
2458
2459 if (ST->hasFullFP16())
2460 if (const auto *Entry = ConvertCostTableLookup(
2461 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2462 return AdjustCost(Entry->Cost);
2463
2464 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
2465 // but we also want to include the TTI::CastContextHint::Masked case too.
2466 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
2467 CCH == TTI::CastContextHint::Masked && ST->hasSVEorSME() &&
2468 TLI->isTypeLegal(DstTy))
2469 CCH = TTI::CastContextHint::Normal;
2470
2471 return AdjustCost(
2472 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2473}
2474
2475 InstructionCost AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode,
2476 Type *Dst,
2477 VectorType *VecTy,
2478 unsigned Index) {
2479
2480 // Make sure we were given a valid extend opcode.
2481 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2482 "Invalid opcode");
2483
2484 // We are extending an element we extract from a vector, so the source type
2485 // of the extend is the element type of the vector.
2486 auto *Src = VecTy->getElementType();
2487
2488 // Sign- and zero-extends are for integer types only.
2489 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2490
2491 // Get the cost for the extract. We compute the cost (if any) for the extend
2492 // below.
2493 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
2494 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2495 CostKind, Index, nullptr, nullptr);
2496
2497 // Legalize the types.
2498 auto VecLT = getTypeLegalizationCost(VecTy);
2499 auto DstVT = TLI->getValueType(DL, Dst);
2500 auto SrcVT = TLI->getValueType(DL, Src);
2501
2502 // If the resulting type is still a vector and the destination type is legal,
2503 // we may get the extension for free. If not, get the default cost for the
2504 // extend.
2505 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2506 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2507 CostKind);
2508
2509 // The destination type should be larger than the element type. If not, get
2510 // the default cost for the extend.
2511 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2512 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2513 CostKind);
2514
2515 switch (Opcode) {
2516 default:
2517 llvm_unreachable("Opcode should be either SExt or ZExt");
2518
2519 // For sign-extends, we only need a smov, which performs the extension
2520 // automatically.
2521 case Instruction::SExt:
2522 return Cost;
2523
2524 // For zero-extends, the extend is performed automatically by a umov unless
2525 // the destination type is i64 and the element type is i8 or i16.
2526 case Instruction::ZExt:
2527 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2528 return Cost;
2529 }
2530
2531 // If we are unable to perform the extend for free, get the default cost.
2532 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2533 CostKind);
2534}
2535
2536 InstructionCost AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
2537 TTI::TargetCostKind CostKind,
2538 const Instruction *I) {
2539 if (CostKind != TTI::TCK_RecipThroughput)
2540 return Opcode == Instruction::PHI ? 0 : 1;
2541 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2542 // Branches are assumed to be predicted.
2543 return 0;
2544}
2545
2546InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(const Instruction *I,
2547 Type *Val,
2548 unsigned Index,
2549 bool HasRealUse) {
2550 assert(Val->isVectorTy() && "This must be a vector type");
2551
2552 if (Index != -1U) {
2553 // Legalize the type.
2554 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2555
2556 // This type is legalized to a scalar type.
2557 if (!LT.second.isVector())
2558 return 0;
2559
2560 // The type may be split. For fixed-width vectors we can normalize the
2561 // index to the new type.
2562 if (LT.second.isFixedLengthVector()) {
2563 unsigned Width = LT.second.getVectorNumElements();
2564 Index = Index % Width;
2565 }
2566
2567 // The element at index zero is already inside the vector.
2568 // - For a physical (HasRealUse==true) insert-element or extract-element
2569 // instruction that extracts integers, an explicit FPR -> GPR move is
2570 // needed. So it has non-zero cost.
2571 // - For the rest of cases (virtual instruction or element type is float),
2572 // consider the instruction free.
2573 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2574 return 0;
2575
2576 // This recognises an LD1 (single-element structure to one lane of one
2577 // register) instruction. I.e., if this is an `insertelement` instruction
2578 // whose second operand is a load, then we will generate an LD1, which
2579 // is an expensive instruction.
2580 if (I && dyn_cast<LoadInst>(I->getOperand(1)))
2581 return ST->getVectorInsertExtractBaseCost() + 1;
2582
2583 // i1 inserts and extracts will include an extra cset or cmp of the vector
2584 // value. Increase the cost by 1 to account for this.
2585 if (Val->getScalarSizeInBits() == 1)
2586 return ST->getVectorInsertExtractBaseCost() + 1;
2587
2588 // FIXME:
2589 // If the extract-element and insert-element instructions could be
2590 // simplified away (e.g., could be combined into users by looking at use-def
2591 // context), they have no cost. This is not done in the first place for
2592 // compile-time considerations.
2593 }
2594
2595 // All other insert/extracts cost this much.
2596 return ST->getVectorInsertExtractBaseCost();
2597}
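// Editorial note (not part of the upstream source; IR names are
// assumptions): with the rules above, extracting lane 0 of a float vector
// is free, extracting lane 0 of an integer vector that has a real use still
// pays the base cost (FPR -> GPR move), and an insertelement fed directly
// by a load, e.g.
//   %v1 = insertelement <4 x i32> %v0, i32 %ld, i64 1   ; %ld = load i32, ...
// is costed one unit above the base insert/extract cost to reflect the LD1
// single-lane form.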
2598
2599 InstructionCost AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2600 TTI::TargetCostKind CostKind,
2601 unsigned Index, Value *Op0,
2602 Value *Op1) {
2603 bool HasRealUse =
2604 Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
2605 return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);
2606}
2607
2608 InstructionCost AArch64TTIImpl::getVectorInstrCost(const Instruction &I,
2609 Type *Val,
2610 TTI::TargetCostKind CostKind,
2611 unsigned Index) {
2612 return getVectorInstrCostHelper(&I, Val, Index, true /* HasRealUse */);
2613}
2614
2615 InstructionCost AArch64TTIImpl::getScalarizationOverhead(
2616 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
2617 TTI::TargetCostKind CostKind) {
2618 if (isa<ScalableVectorType>(Ty))
2619 return InstructionCost::getInvalid();
2620 if (Ty->getElementType()->isFloatingPointTy())
2621 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
2622 CostKind);
2623 return DemandedElts.popcount() * (Insert + Extract) *
2624 ST->getVectorInsertExtractBaseCost();
2625}
2626
2627 InstructionCost AArch64TTIImpl::getArithmeticInstrCost(
2628 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2629 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2630 ArrayRef<const Value *> Args,
2631 const Instruction *CxtI) {
2632
2633 // TODO: Handle more cost kinds.
2634 if (CostKind != TTI::TCK_RecipThroughput)
2635 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2636 Op2Info, Args, CxtI);
2637
2638 // Legalize the type.
2639 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2640 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2641
2642 switch (ISD) {
2643 default:
2644 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2645 Op2Info);
2646 case ISD::SDIV:
2647 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2648 // On AArch64, scalar signed division by constants power-of-two are
2649 // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2650 // The OperandValue properties may not be the same as those of the previous
2651 // operation; conservatively assume OP_None.
2652 InstructionCost Cost = getArithmeticInstrCost(
2653 Instruction::Add, Ty, CostKind,
2654 Op1Info.getNoProps(), Op2Info.getNoProps());
2655 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2656 Op1Info.getNoProps(), Op2Info.getNoProps());
2657 Cost += getArithmeticInstrCost(
2658 Instruction::Select, Ty, CostKind,
2659 Op1Info.getNoProps(), Op2Info.getNoProps());
2660 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2661 Op1Info.getNoProps(), Op2Info.getNoProps());
2662 return Cost;
2663 }
2664 [[fallthrough]];
2665 case ISD::UDIV: {
2666 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2667 auto VT = TLI->getValueType(DL, Ty);
2668 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2669 // Vector signed division by a constant is expanded to the
2670 // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2671 // to MULHS + SUB + SRL + ADD + SRL.
2672 InstructionCost MulCost = getArithmeticInstrCost(
2673 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2674 InstructionCost AddCost = getArithmeticInstrCost(
2675 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2676 InstructionCost ShrCost = getArithmeticInstrCost(
2677 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2678 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2679 }
2680 }
2681
2682 InstructionCost Cost = BaseT::getArithmeticInstrCost(
2683 Opcode, Ty, CostKind, Op1Info, Op2Info);
2684 if (Ty->isVectorTy()) {
2685 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2686 // If SDIV/UDIV operations are lowered using SVE, the costs can be
2687 // lower.
2688 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2689 ->getPrimitiveSizeInBits()
2690 .getFixedValue() < 128) {
2691 EVT VT = TLI->getValueType(DL, Ty);
2692 static const CostTblEntry DivTbl[]{
2693 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
2694 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
2695 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
2696 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
2697 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
2698 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
2699
2700 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2701 if (nullptr != Entry)
2702 return Entry->Cost;
2703 }
2704 // For 8/16-bit elements, the cost is higher because the type
2705 // requires promotion and possibly splitting:
2706 if (LT.second.getScalarType() == MVT::i8)
2707 Cost *= 8;
2708 else if (LT.second.getScalarType() == MVT::i16)
2709 Cost *= 4;
2710 return Cost;
2711 } else {
2712 // If one of the operands is a uniform constant then the cost for each
2713 // element is Cost for insertion, extraction and division.
2714 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
2715 // operation with scalar type
2716 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2717 (Op2Info.isConstant() && Op2Info.isUniform())) {
2718 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2719 InstructionCost DivCost = BaseT::getArithmeticInstrCost(
2720 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2721 return (4 + DivCost) * VTy->getNumElements();
2722 }
2723 }
2724 // On AArch64, without SVE, vector divisions are expanded
2725 // into scalar divisions of each pair of elements.
2726 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2727 CostKind, Op1Info, Op2Info);
2728 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2729 Op1Info, Op2Info);
2730 }
2731
2732 // TODO: if one of the arguments is scalar, then it's not necessary to
2733 // double the cost of handling the vector elements.
2734 Cost += Cost;
2735 }
2736 return Cost;
2737 }
2738 case ISD::MUL:
2739 // When SVE is available, then we can lower the v2i64 operation using
2740 // the SVE mul instruction, which has a lower cost.
2741 if (LT.second == MVT::v2i64 && ST->hasSVE())
2742 return LT.first;
2743
2744 // When SVE is not available, there is no MUL.2d instruction,
2745 // which means mul <2 x i64> is expensive as elements are extracted
2746 // from the vectors and the muls scalarized.
2747 // As getScalarizationOverhead is a bit too pessimistic, we
2748 // estimate the cost for a i64 vector directly here, which is:
2749 // - four 2-cost i64 extracts,
2750 // - two 2-cost i64 inserts, and
2751 // - two 1-cost muls.
2752 // So, for a v2i64 with LT.First = 1 the cost is 14, and for a v4i64 with
2753 // LT.first = 2 the cost is 28. If both operands are extensions it will not
2754 // need to scalarize so the cost can be cheaper (smull or umull).
2756 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2757 return LT.first;
2758 return LT.first * 14;
2759 case ISD::ADD:
2760 case ISD::XOR:
2761 case ISD::OR:
2762 case ISD::AND:
2763 case ISD::SRL:
2764 case ISD::SRA:
2765 case ISD::SHL:
2766 // These nodes are marked as 'custom' for combining purposes only.
2767 // We know that they are legal. See LowerAdd in ISelLowering.
2768 return LT.first;
2769
2770 case ISD::FNEG:
2771 case ISD::FADD:
2772 case ISD::FSUB:
2773 // Increase the cost for half and bfloat types if not architecturally
2774 // supported.
2775 if ((Ty->getScalarType()->isHalfTy() && !ST->hasFullFP16()) ||
2776 (Ty->getScalarType()->isBFloatTy() && !ST->hasBF16()))
2777 return 2 * LT.first;
2778 if (!Ty->getScalarType()->isFP128Ty())
2779 return LT.first;
2780 [[fallthrough]];
2781 case ISD::FMUL:
2782 case ISD::FDIV:
2783 // These nodes are marked as 'custom' just to lower them to SVE.
2784 // We know said lowering will incur no additional cost.
2785 if (!Ty->getScalarType()->isFP128Ty())
2786 return 2 * LT.first;
2787
2788 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2789 Op2Info);
2790 }
2791}
2792
2793 InstructionCost AArch64TTIImpl::getAddressComputationCost(Type *Ty,
2794 ScalarEvolution *SE,
2795 const SCEV *Ptr) {
2796 // Address computations in vectorized code with non-consecutive addresses will
2797 // likely result in more instructions compared to scalar code where the
2798 // computation can more often be merged into the index mode. The resulting
2799 // extra micro-ops can significantly decrease throughput.
2800 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
2801 int MaxMergeDistance = 64;
2802
2803 if (Ty->isVectorTy() && SE &&
2804 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2805 return NumVectorInstToHideOverhead;
2806
2807 // In many cases the address computation is not merged into the instruction
2808 // addressing mode.
2809 return 1;
2810}
2811
2812 InstructionCost AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
2813 Type *CondTy,
2814 CmpInst::Predicate VecPred,
2815 TTI::TargetCostKind CostKind,
2816 const Instruction *I) {
2817 // TODO: Handle other cost kinds.
2818 if (CostKind != TTI::TCK_RecipThroughput)
2819 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2820 I);
2821
2822 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2823 // Some vector selects that are wider than the register width are not
2824 // lowered well.
2825 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2826 // We would need this many instructions to hide the scalarization happening.
2827 const int AmortizationCost = 20;
2828
2829 // If VecPred is not set, check if we can get a predicate from the context
2830 // instruction, if its type matches the requested ValTy.
2831 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2832 CmpInst::Predicate CurrentPred;
2833 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2834 m_Value())))
2835 VecPred = CurrentPred;
2836 }
2837 // Check if we have a compare/select chain that can be lowered using
2838 // a (F)CMxx & BFI pair.
2839 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2840 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2841 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2842 VecPred == CmpInst::FCMP_UNE) {
2843 static const auto ValidMinMaxTys = {
2844 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
2845 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
2846 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2847
2848 auto LT = getTypeLegalizationCost(ValTy);
2849 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2850 (ST->hasFullFP16() &&
2851 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2852 return LT.first;
2853 }
2854
2855 static const TypeConversionCostTblEntry
2856 VectorSelectTbl[] = {
2857 { ISD::SELECT, MVT::v2i1, MVT::v2f32, 2 },
2858 { ISD::SELECT, MVT::v2i1, MVT::v2f64, 2 },
2859 { ISD::SELECT, MVT::v4i1, MVT::v4f32, 2 },
2860 { ISD::SELECT, MVT::v4i1, MVT::v4f16, 2 },
2861 { ISD::SELECT, MVT::v8i1, MVT::v8f16, 2 },
2862 { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 },
2863 { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 },
2864 { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 },
2865 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2866 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2867 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2868 };
2869
2870 EVT SelCondTy = TLI->getValueType(DL, CondTy);
2871 EVT SelValTy = TLI->getValueType(DL, ValTy);
2872 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2873 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2874 SelCondTy.getSimpleVT(),
2875 SelValTy.getSimpleVT()))
2876 return Entry->Cost;
2877 }
2878 }
2879
2880 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {
2881 auto LT = getTypeLegalizationCost(ValTy);
2882 // Cost v4f16 FCmp without FP16 support via converting to v4f32 and back.
2883 if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
2884 return LT.first * 4; // fcvtl + fcvtl + fcmp + xtn
2885 }
2886
2887 // Treat the icmp in icmp(and, 0) as free, as we can make use of ands.
2888 // FIXME: This can apply to more conditions and add/sub if it can be shown to
2889 // be profitable.
2890 if (ValTy->isIntegerTy() && ISD == ISD::SETCC && I &&
2891 ICmpInst::isEquality(VecPred) &&
2892 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
2893 match(I->getOperand(1), m_Zero()) &&
2894 match(I->getOperand(0), m_And(m_Value(), m_Value())))
2895 return 0;
2896
2897 // The base case handles scalable vectors fine for now, since it treats the
2898 // cost as 1 * legalization cost.
2899 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2900}
2901
2902 TTI::MemCmpExpansionOptions
2903 AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2904 TTI::MemCmpExpansionOptions Options;
2905 if (ST->requiresStrictAlign()) {
2906 // TODO: Add cost modeling for strict align. Misaligned loads expand to
2907 // a bunch of instructions when strict align is enabled.
2908 return Options;
2909 }
2910 Options.AllowOverlappingLoads = true;
2911 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2912 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2913 // TODO: Though vector loads usually perform well on AArch64, in some targets
2914 // they may wake up the FP unit, which raises the power consumption. Perhaps
2915 // they could be used with no holds barred (-O3).
2916 Options.LoadSizes = {8, 4, 2, 1};
2917 return Options;
2918}
2919
2920 bool AArch64TTIImpl::prefersVectorizedAddressing() const {
2921 return ST->hasSVE();
2922}
2923
2924 InstructionCost
2925 AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
2926 Align Alignment, unsigned AddressSpace,
2927 TTI::TargetCostKind CostKind) {
2928 if (useNeonVector(Src))
2929 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2930 CostKind);
2931 auto LT = getTypeLegalizationCost(Src);
2932 if (!LT.first.isValid())
2933 return InstructionCost::getInvalid();
2934
2935 // The code-generator is currently not able to handle scalable vectors
2936 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2937 // it. This change will be removed when code-generation for these types is
2938 // sufficiently reliable.
2939 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2940 return InstructionCost::getInvalid();
2941
2942 return LT.first;
2943}
2944
2945static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2946 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2947}
2948
2949 InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
2950 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2951 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2952 if (useNeonVector(DataTy))
2953 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2954 Alignment, CostKind, I);
2955 auto *VT = cast<VectorType>(DataTy);
2956 auto LT = getTypeLegalizationCost(DataTy);
2957 if (!LT.first.isValid())
2958 return InstructionCost::getInvalid();
2959
2960 // The code-generator is currently not able to handle scalable vectors
2961 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2962 // it. This change will be removed when code-generation for these types is
2963 // sufficiently reliable.
2964 if (cast<VectorType>(DataTy)->getElementCount() ==
2965 ElementCount::getScalable(1))
2966 return InstructionCost::getInvalid();
2967
2968 ElementCount LegalVF = LT.second.getVectorElementCount();
2969 InstructionCost MemOpCost =
2970 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
2971 {TTI::OK_AnyValue, TTI::OP_None}, I);
2972 // Add on an overhead cost for using gathers/scatters.
2973 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2974 // point we may want a per-CPU overhead.
2975 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2976 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2977}
2978
2979 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
2980 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2981}
2982
2983 InstructionCost AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
2984 MaybeAlign Alignment,
2985 unsigned AddressSpace,
2986 TTI::TargetCostKind CostKind,
2987 TTI::OperandValueInfo OpInfo,
2988 const Instruction *I) {
2989 EVT VT = TLI->getValueType(DL, Ty, true);
2990 // Type legalization can't handle structs
2991 if (VT == MVT::Other)
2992 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2993 CostKind);
2994
2995 auto LT = getTypeLegalizationCost(Ty);
2996 if (!LT.first.isValid())
2997 return InstructionCost::getInvalid();
2998
2999 // The code-generator is currently not able to handle scalable vectors
3000 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3001 // it. This change will be removed when code-generation for these types is
3002 // sufficiently reliable.
3003 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
3004 if (VTy->getElementCount() == ElementCount::getScalable(1))
3005 return InstructionCost::getInvalid();
3006
3007 // TODO: consider latency as well for TCK_SizeAndLatency.
3009 return LT.first;
3010
3012 return 1;
3013
3014 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
3015 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
3016 // Unaligned stores are extremely inefficient. We don't split all
3017 // unaligned 128-bit stores because of the negative impact that has been
3018 // seen in practice on inlined block copy code.
3019 // We make such stores expensive so that we will only vectorize if there
3020 // are 6 other instructions getting vectorized.
3021 const int AmortizationCost = 6;
3022
3023 return LT.first * 2 * AmortizationCost;
3024 }
3025
3026 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
3027 if (Ty->isPtrOrPtrVectorTy())
3028 return LT.first;
3029
3030 // Check truncating stores and extending loads.
3031 if (useNeonVector(Ty) &&
3032 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
3033 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
3034 if (VT == MVT::v4i8)
3035 return 2;
3036 // Otherwise we need to scalarize.
3037 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
3038 }
3039
3040 return LT.first;
3041}
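// Example: on a subtarget where isMisaligned128StoreSlow() holds, a <4 x i32>
// store with only 8-byte alignment takes the amortization path and costs
// LT.first * 2 * 6 = 12 (for LT.first == 1), whereas the same store with
// 16-byte alignment simply costs LT.first = 1.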
3042
3043 InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
3044 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
3045 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
3046 bool UseMaskForCond, bool UseMaskForGaps) {
3047 assert(Factor >= 2 && "Invalid interleave factor");
3048 auto *VecVTy = cast<VectorType>(VecTy);
3049
3050 if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
3051 return InstructionCost::getInvalid();
3052
3053 // Vectorization for masked interleaved accesses is only enabled for scalable
3054 // VF.
3055 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
3056 return InstructionCost::getInvalid();
3057
3058 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
3059 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
3060 auto *SubVecTy =
3061 VectorType::get(VecVTy->getElementType(),
3062 VecVTy->getElementCount().divideCoefficientBy(Factor));
3063
3064 // ldN/stN only support legal vector types of 64 or 128 bits in size.
3065 // Accesses having vector types that are a multiple of 128 bits can be
3066 // matched to more than one ldN/stN instruction.
3067 bool UseScalable;
3068 if (MinElts % Factor == 0 &&
3069 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
3070 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
3071 }
3072
3073 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
3074 Alignment, AddressSpace, CostKind,
3075 UseMaskForCond, UseMaskForGaps);
3076}
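// Example: a factor-2 interleaved load of <8 x i16> splits into a sub-vector
// type of <4 x i16> (64 bits). Assuming that is a legal interleaved access
// type covered by a single ld2, the cost is Factor * 1 = 2; otherwise the
// request falls through to the base implementation.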
3077
3078 InstructionCost
3079 AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
3080 InstructionCost Cost = 0;
3081 TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
3082 for (auto *I : Tys) {
3083 if (!I->isVectorTy())
3084 continue;
3085 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
3086 128)
3087 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
3088 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
3089 }
3090 return Cost;
3091}
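// Example: keeping two 128-bit vectors (say <4 x i32> and <2 x double>) live
// across a call is modelled as a spill and reload of each, roughly
// 2 * (store + load) = 4, assuming each aligned 128-bit memory op costs 1.
// Scalars and vectors that are not exactly 128 bits are skipped by the loop
// above and treated as free here.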
3092
3093 unsigned AArch64TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
3094 return ST->getMaxInterleaveFactor();
3095}
3096
3097// For Falkor, we want to avoid having too many strided loads in a loop since
3098// that can exhaust the HW prefetcher resources. We adjust the unroller
3099// MaxCount preference below to attempt to ensure unrolling doesn't create too
3100// many strided loads.
3101static void
3102 getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3103 TargetTransformInfo::UnrollingPreferences &UP) {
3104 enum { MaxStridedLoads = 7 };
3105 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
3106 int StridedLoads = 0;
3107 // FIXME? We could make this more precise by looking at the CFG and
3108 // e.g. not counting loads in each side of an if-then-else diamond.
3109 for (const auto BB : L->blocks()) {
3110 for (auto &I : *BB) {
3111 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
3112 if (!LMemI)
3113 continue;
3114
3115 Value *PtrValue = LMemI->getPointerOperand();
3116 if (L->isLoopInvariant(PtrValue))
3117 continue;
3118
3119 const SCEV *LSCEV = SE.getSCEV(PtrValue);
3120 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
3121 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
3122 continue;
3123
3124 // FIXME? We could take pairing of unrolled load copies into account
3125 // by looking at the AddRec, but we would probably have to limit this
3126 // to loops with no stores or other memory optimization barriers.
3127 ++StridedLoads;
3128 // We've seen enough strided loads that seeing more won't make a
3129 // difference.
3130 if (StridedLoads > MaxStridedLoads / 2)
3131 return StridedLoads;
3132 }
3133 }
3134 return StridedLoads;
3135 };
3136
3137 int StridedLoads = countStridedLoads(L, SE);
3138 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
3139 << " strided loads\n");
3140 // Pick the largest power of 2 unroll count that won't result in too many
3141 // strided loads.
3142 if (StridedLoads) {
3143 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
3144 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
3145 << UP.MaxCount << '\n');
3146 }
3147}
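// Example: with MaxStridedLoads = 7, a loop containing 2 strided loads gets
// UP.MaxCount = 1 << Log2_32(7 / 2) = 2, so unrolling produces at most about
// 4 strided load copies and stays within the Falkor HW prefetcher's budget.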
3148
3149 void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
3150 TTI::UnrollingPreferences &UP,
3151 OptimizationRemarkEmitter *ORE) {
3152 // Enable partial unrolling and runtime unrolling.
3153 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
3154
3155 UP.UpperBound = true;
3156
3157 // An inner loop is more likely to be hot, and its runtime check can be
3158 // hoisted out by the LICM pass, so the overhead is lower; try a larger
3159 // threshold to unroll more loops.
3160 if (L->getLoopDepth() > 1)
3161 UP.PartialThreshold *= 2;
3162
3163 // Disable partial & runtime unrolling on -Os.
3164 UP.PartialOptSizeThreshold = 0;
3165
3166 if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
3167 EnableFalkorHWPFUnrollFix)
3168 getFalkorUnrollingPreferences(L, SE, UP);
3169
3170 // Scan the loop: don't unroll loops with calls as this could prevent
3171 // inlining. Don't unroll vector loops either, as they don't benefit much from
3172 // unrolling.
3173 for (auto *BB : L->getBlocks()) {
3174 for (auto &I : *BB) {
3175 // Don't unroll vectorised loops.
3176 if (I.getType()->isVectorTy())
3177 return;
3178
3179 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
3180 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
3181 if (!isLoweredToCall(F))
3182 continue;
3183 }
3184 return;
3185 }
3186 }
3187 }
3188
3189 // Enable runtime unrolling for in-order models
3190 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
3191 // checking for that case, we can ensure that the default behaviour is
3192 // unchanged
3193 if (ST->getProcFamily() != AArch64Subtarget::Others &&
3194 !ST->getSchedModel().isOutOfOrder()) {
3195 UP.Runtime = true;
3196 UP.Partial = true;
3197 UP.UnrollRemainder = true;
3198 UP.DefaultUnrollRuntimeCount = 4;
3199
3200 UP.UnrollAndJam = true;
3201 UP.UnrollAndJamInnerLoopThreshold = 60;
3202 }
3203}
3204
3205 void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
3206 TTI::PeelingPreferences &PP) {
3207 BaseT::getPeelingPreferences(L, SE, PP);
3208 }
3209
3210 Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
3211 Type *ExpectedType) {
3212 switch (Inst->getIntrinsicID()) {
3213 default:
3214 return nullptr;
3215 case Intrinsic::aarch64_neon_st2:
3216 case Intrinsic::aarch64_neon_st3:
3217 case Intrinsic::aarch64_neon_st4: {
3218 // Create a struct type
3219 StructType *ST = dyn_cast<StructType>(ExpectedType);
3220 if (!ST)
3221 return nullptr;
3222 unsigned NumElts = Inst->arg_size() - 1;
3223 if (ST->getNumElements() != NumElts)
3224 return nullptr;
3225 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3226 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
3227 return nullptr;
3228 }
3229 Value *Res = PoisonValue::get(ExpectedType);
3230 IRBuilder<> Builder(Inst);
3231 for (unsigned i = 0, e = NumElts; i != e; ++i) {
3232 Value *L = Inst->getArgOperand(i);
3233 Res = Builder.CreateInsertValue(Res, L, i);
3234 }
3235 return Res;
3236 }
3237 case Intrinsic::aarch64_neon_ld2:
3238 case Intrinsic::aarch64_neon_ld3:
3239 case Intrinsic::aarch64_neon_ld4:
3240 if (Inst->getType() == ExpectedType)
3241 return Inst;
3242 return nullptr;
3243 }
3244}
3245
3246 bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
3247 MemIntrinsicInfo &Info) {
3248 switch (Inst->getIntrinsicID()) {
3249 default:
3250 break;
3251 case Intrinsic::aarch64_neon_ld2:
3252 case Intrinsic::aarch64_neon_ld3:
3253 case Intrinsic::aarch64_neon_ld4:
3254 Info.ReadMem = true;
3255 Info.WriteMem = false;
3256 Info.PtrVal = Inst->getArgOperand(0);
3257 break;
3258 case Intrinsic::aarch64_neon_st2:
3259 case Intrinsic::aarch64_neon_st3:
3260 case Intrinsic::aarch64_neon_st4:
3261 Info.ReadMem = false;
3262 Info.WriteMem = true;
3263 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
3264 break;
3265 }
3266
3267 switch (Inst->getIntrinsicID()) {
3268 default:
3269 return false;
3270 case Intrinsic::aarch64_neon_ld2:
3271 case Intrinsic::aarch64_neon_st2:
3272 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
3273 break;
3274 case Intrinsic::aarch64_neon_ld3:
3275 case Intrinsic::aarch64_neon_st3:
3276 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
3277 break;
3278 case Intrinsic::aarch64_neon_ld4:
3279 case Intrinsic::aarch64_neon_st4:
3280 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
3281 break;
3282 }
3283 return true;
3284}
3285
3286/// See if \p I should be considered for address type promotion. We check if \p
3287 /// I is a sext with the right type that is used in memory accesses. If it is used in a
3288/// "complex" getelementptr, we allow it to be promoted without finding other
3289/// sext instructions that sign extended the same initial value. A getelementptr
3290/// is considered as "complex" if it has more than 2 operands.
3291 bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
3292 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
3293 bool Considerable = false;
3294 AllowPromotionWithoutCommonHeader = false;
3295 if (!isa<SExtInst>(&I))
3296 return false;
3297 Type *ConsideredSExtType =
3298 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3299 if (I.getType() != ConsideredSExtType)
3300 return false;
3301 // See if the sext is the one with the right type and used in at least one
3302 // GetElementPtrInst.
3303 for (const User *U : I.users()) {
3304 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3305 Considerable = true;
3306 // A getelementptr is considered as "complex" if it has more than 2
3307 // operands. We will promote a SExt used in such complex GEP as we
3308 // expect some computation to be merged if they are done on 64 bits.
3309 if (GEPInst->getNumOperands() > 2) {
3310 AllowPromotionWithoutCommonHeader = true;
3311 break;
3312 }
3313 }
3314 }
3315 return Considerable;
3316}
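// For example, a "sext i32 %i to i64" feeding a simple two-operand GEP is
// reported as considerable, but promotion still relies on finding other sexts
// of the same value (a common header); if the GEP has more than two operands
// (e.g. a struct or array index is involved), AllowPromotionWithoutCommonHeader
// is set and the sext may be promoted on its own.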
3317
3318 bool AArch64TTIImpl::isLegalToVectorizeReduction(
3319 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
3320 if (!VF.isScalable())
3321 return true;
3322
3323 Type *Ty = RdxDesc.getRecurrenceType();
3324 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
3325 return false;
3326
3327 switch (RdxDesc.getRecurrenceKind()) {
3328 case RecurKind::Add:
3329 case RecurKind::FAdd:
3330 case RecurKind::And:
3331 case RecurKind::Or:
3332 case RecurKind::Xor:
3333 case RecurKind::SMin:
3334 case RecurKind::SMax:
3335 case RecurKind::UMin:
3336 case RecurKind::UMax:
3337 case RecurKind::FMin:
3338 case RecurKind::FMax:
3339 case RecurKind::FMulAdd:
3340 case RecurKind::IAnyOf:
3341 case RecurKind::FAnyOf:
3342 return true;
3343 default:
3344 return false;
3345 }
3346}
3347
3348 InstructionCost
3349 AArch64TTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
3350 FastMathFlags FMF,
3351 TTI::TargetCostKind CostKind) {
3352 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
3353
3354 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
3355 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
3356
3357 InstructionCost LegalizationCost = 0;
3358 if (LT.first > 1) {
3359 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
3360 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
3361 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
3362 }
3363
3364 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
3365}
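// Example: a float min reduction over <8 x float> legalizes to two v4f32
// halves (LT.first == 2), so the cost is one extra v4f32 min intrinsic to
// combine the halves plus 2 for the final horizontal reduction; f16 without
// full FP16 support falls back to the base implementation above.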
3366
3367 InstructionCost AArch64TTIImpl::getArithmeticReductionCostSVE(
3368 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
3369 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3370 InstructionCost LegalizationCost = 0;
3371 if (LT.first > 1) {
3372 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
3373 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
3374 LegalizationCost *= LT.first - 1;
3375 }
3376
3377 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3378 assert(ISD && "Invalid opcode");
3379 // Add the final reduction cost for the legal horizontal reduction
3380 switch (ISD) {
3381 case ISD::ADD:
3382 case ISD::AND:
3383 case ISD::OR:
3384 case ISD::XOR:
3385 case ISD::FADD:
3386 return LegalizationCost + 2;
3387 default:
3388 return InstructionCost::getInvalid();
3389 }
3390}
3391
3392 InstructionCost
3393 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
3394 std::optional<FastMathFlags> FMF,
3395 TTI::TargetCostKind CostKind) {
3396 if (TTI::requiresOrderedReduction(FMF)) {
3397 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
3398 InstructionCost BaseCost =
3399 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3400 // Add on extra cost to reflect the extra overhead on some CPUs. We still
3401 // end up vectorizing for more computationally intensive loops.
3402 return BaseCost + FixedVTy->getNumElements();
3403 }
3404
3405 if (Opcode != Instruction::FAdd)
3406 return InstructionCost::getInvalid();
3407
3408 auto *VTy = cast<ScalableVectorType>(ValTy);
3409 InstructionCost Cost =
3410 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
3411 Cost *= getMaxNumElements(VTy->getElementCount());
3412 return Cost;
3413 }
3414
3415 if (isa<ScalableVectorType>(ValTy))
3416 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
3417
3418 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3419 MVT MTy = LT.second;
3420 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3421 assert(ISD && "Invalid opcode");
3422
3423 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3424 // instructions as twice a normal vector add, plus 1 for each legalization
3425 // step (LT.first). This is the only arithmetic vector reduction operation for
3426 // which we have an instruction.
3427 // OR, XOR and AND costs should match the codegen from:
3428 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3429 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3430 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3431 static const CostTblEntry CostTblNoPairwise[]{
3432 {ISD::ADD, MVT::v8i8, 2},
3433 {ISD::ADD, MVT::v16i8, 2},
3434 {ISD::ADD, MVT::v4i16, 2},
3435 {ISD::ADD, MVT::v8i16, 2},
3436 {ISD::ADD, MVT::v4i32, 2},
3437 {ISD::ADD, MVT::v2i64, 2},
3438 {ISD::OR, MVT::v8i8, 15},
3439 {ISD::OR, MVT::v16i8, 17},
3440 {ISD::OR, MVT::v4i16, 7},
3441 {ISD::OR, MVT::v8i16, 9},
3442 {ISD::OR, MVT::v2i32, 3},
3443 {ISD::OR, MVT::v4i32, 5},
3444 {ISD::OR, MVT::v2i64, 3},
3445 {ISD::XOR, MVT::v8i8, 15},
3446 {ISD::XOR, MVT::v16i8, 17},
3447 {ISD::XOR, MVT::v4i16, 7},
3448 {ISD::XOR, MVT::v8i16, 9},
3449 {ISD::XOR, MVT::v2i32, 3},
3450 {ISD::XOR, MVT::v4i32, 5},
3451 {ISD::XOR, MVT::v2i64, 3},
3452 {ISD::AND, MVT::v8i8, 15},
3453 {ISD::AND, MVT::v16i8, 17},
3454 {ISD::AND, MVT::v4i16, 7},
3455 {ISD::AND, MVT::v8i16, 9},
3456 {ISD::AND, MVT::v2i32, 3},
3457 {ISD::AND, MVT::v4i32, 5},
3458 {ISD::AND, MVT::v2i64, 3},
3459 };
3460 switch (ISD) {
3461 default:
3462 break;
3463 case ISD::ADD:
3464 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3465 return (LT.first - 1) + Entry->Cost;
3466 break;
3467 case ISD::XOR:
3468 case ISD::AND:
3469 case ISD::OR:
3470 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3471 if (!Entry)
3472 break;
3473 auto *ValVTy = cast<FixedVectorType>(ValTy);
3474 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3475 isPowerOf2_32(ValVTy->getNumElements())) {
3476 InstructionCost ExtraCost = 0;
3477 if (LT.first != 1) {
3478 // Type needs to be split, so there is an extra cost of LT.first - 1
3479 // arithmetic ops.
3480 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3481 MTy.getVectorNumElements());
3482 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3483 ExtraCost *= LT.first - 1;
3484 }
3485 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
3486 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
3487 return Cost + ExtraCost;
3488 }
3489 break;
3490 }
3491 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3492}
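// Example: a vecreduce.add of <16 x i8> matches {ISD::ADD, MVT::v16i8, 2} in
// the table above with LT.first == 1, giving (1 - 1) + 2 = 2, i.e. the addv
// is modelled as roughly two ordinary vector adds.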
3493
3494 InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3495 static const CostTblEntry ShuffleTbl[] = {
3496 { TTI::SK_Splice, MVT::nxv16i8, 1 },
3497 { TTI::SK_Splice, MVT::nxv8i16, 1 },
3498 { TTI::SK_Splice, MVT::nxv4i32, 1 },
3499 { TTI::SK_Splice, MVT::nxv2i64, 1 },
3500 { TTI::SK_Splice, MVT::nxv2f16, 1 },
3501 { TTI::SK_Splice, MVT::nxv4f16, 1 },
3502 { TTI::SK_Splice, MVT::nxv8f16, 1 },
3503 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
3504 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
3505 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
3506 { TTI::SK_Splice, MVT::nxv2f32, 1 },
3507 { TTI::SK_Splice, MVT::nxv4f32, 1 },
3508 { TTI::SK_Splice, MVT::nxv2f64, 1 },
3509 };
3510
3511 // The code-generator is currently not able to handle scalable vectors
3512 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3513 // it. This change will be removed when code-generation for these types is
3514 // sufficiently reliable.
3515 if (Tp->getElementCount() == ElementCount::getScalable(1))
3516 return InstructionCost::getInvalid();
3517
3518 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3519 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3521 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3522 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3523 : LT.second;
3524 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3525 InstructionCost LegalizationCost = 0;
3526 if (Index < 0) {
3527 LegalizationCost =
3528 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3530 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3532 }
3533
3534 // Predicated splices are promoted when lowering. See AArch64ISelLowering.cpp.
3535 // The cost is computed on the promoted type.
3536 if (LT.second.getScalarType() == MVT::i1) {
3537 LegalizationCost +=
3538 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3540 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3542 }
3543 const auto *Entry =
3544 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3545 assert(Entry && "Illegal Type for Splice");
3546 LegalizationCost += Entry->Cost;
3547 return LegalizationCost * LT.first;
3548}
3549
3550 InstructionCost AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
3551 VectorType *Tp,
3552 ArrayRef<int> Mask,
3553 TTI::TargetCostKind CostKind,
3554 int Index, VectorType *SubTp,
3555 ArrayRef<const Value *> Args) {
3556 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3557 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3558 // into smaller vectors and sum the cost of each shuffle.
3559 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3560 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3561 cast<FixedVectorType>(Tp)->getNumElements() >
3562 LT.second.getVectorNumElements() &&
3563 !Index && !SubTp) {
3564 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
3565 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
3566 unsigned LTNumElts = LT.second.getVectorNumElements();
3567 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3568 VectorType *NTp =
3569 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3570 InstructionCost Cost;
3571 for (unsigned N = 0; N < NumVecs; N++) {
3572 SmallVector<int> NMask;
3573 // Split the existing mask into chunks of size LTNumElts. Track the source
3574 // sub-vectors to ensure the result has at most 2 inputs.
3575 unsigned Source1, Source2;
3576 unsigned NumSources = 0;
3577 for (unsigned E = 0; E < LTNumElts; E++) {
3578 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3579 : PoisonMaskElem;
3580 if (MaskElt < 0) {
3581 NMask.push_back(PoisonMaskElem);
3582 continue;
3583 }
3584
3585 // Calculate which source from the input this comes from and whether it
3586 // is new to us.
3587 unsigned Source = MaskElt / LTNumElts;
3588 if (NumSources == 0) {
3589 Source1 = Source;
3590 NumSources = 1;
3591 } else if (NumSources == 1 && Source != Source1) {
3592 Source2 = Source;
3593 NumSources = 2;
3594 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3595 NumSources++;
3596 }
3597
3598 // Add to the new mask. For the NumSources>2 case these are not correct,
3599 // but are only used for the modular lane number.
3600 if (Source == Source1)
3601 NMask.push_back(MaskElt % LTNumElts);
3602 else if (Source == Source2)
3603 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3604 else
3605 NMask.push_back(MaskElt % LTNumElts);
3606 }
3607 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3608 // getShuffleCost. If not then cost it using the worst case.
3609 if (NumSources <= 2)
3610 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3611 : TTI::SK_PermuteTwoSrc,
3612 NTp, NMask, CostKind, 0, nullptr, Args);
3613 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3614 return ME.value() % LTNumElts == ME.index();
3615 }))
3616 Cost += LTNumElts - 1;
3617 else
3618 Cost += LTNumElts;
3619 }
3620 return Cost;
3621 }
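// Example for the splitting above: a shuffle of <8 x i32> legalizes to two
// v4i32 parts (LTNumElts == 4, NumVecs == 2); each sub-mask that draws on at
// most two source sub-vectors is re-costed as a v4i32 shuffle, while a
// sub-mask mixing more sources is charged LTNumElts (or LTNumElts - 1 when
// some lanes already sit in the right position).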
3622
3623 Kind = improveShuffleKindFromMask(Kind, Mask, Tp, Index, SubTp);
3624
3625 // Check for broadcast loads, which are supported by the LD1R instruction.
3626 // In terms of code-size, the shuffle vector is free when a load + dup get
3627 // folded into a LD1R. That's what we check and return here. For performance
3628 // and reciprocal throughput, a LD1R is not completely free. In this case, we
3629 // return the cost for the broadcast below (i.e. 1 for most/all types), so
3630 // that we model the load + dup sequence slightly higher because LD1R is a
3631 // high latency instruction.
3632 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
3633 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3634 if (IsLoad && LT.second.isVector() &&
3635 isLegalBroadcastLoad(Tp->getElementType(),
3636 LT.second.getVectorElementCount()))
3637 return 0;
3638 }
3639
3640 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3641 // from the perfect shuffle tables.
3642 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3643 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3644 all_of(Mask, [](int E) { return E < 8; }))
3645 return getPerfectShuffleCost(Mask);
3646
3647 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3648 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3649 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3650 static const CostTblEntry ShuffleTbl[] = {
3651 // Broadcast shuffle kinds can be performed with 'dup'.
3652 {TTI::SK_Broadcast, MVT::v8i8, 1},
3653 {TTI::SK_Broadcast, MVT::v16i8, 1},
3654 {TTI::SK_Broadcast, MVT::v4i16, 1},
3655 {TTI::SK_Broadcast, MVT::v8i16, 1},
3656 {TTI::SK_Broadcast, MVT::v2i32, 1},
3657 {TTI::SK_Broadcast, MVT::v4i32, 1},
3658 {TTI::SK_Broadcast, MVT::v2i64, 1},
3659 {TTI::SK_Broadcast, MVT::v4f16, 1},
3660 {TTI::SK_Broadcast, MVT::v8f16, 1},
3661 {TTI::SK_Broadcast, MVT::v2f32, 1},
3662 {TTI::SK_Broadcast, MVT::v4f32, 1},
3663 {TTI::SK_Broadcast, MVT::v2f64, 1},
3664 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3665 // 'zip1/zip2' instructions.
3666 {TTI::SK_Transpose, MVT::v8i8, 1},
3667 {TTI::SK_Transpose, MVT::v16i8, 1},
3668 {TTI::SK_Transpose, MVT::v4i16, 1},
3669 {TTI::SK_Transpose, MVT::v8i16, 1},
3670 {TTI::SK_Transpose, MVT::v2i32, 1},
3671 {TTI::SK_Transpose, MVT::v4i32, 1},
3672 {TTI::SK_Transpose, MVT::v2i64, 1},
3673 {TTI::SK_Transpose, MVT::v4f16, 1},
3674 {TTI::SK_Transpose, MVT::v8f16, 1},
3675 {TTI::SK_Transpose, MVT::v2f32, 1},
3676 {TTI::SK_Transpose, MVT::v4f32, 1},
3677 {TTI::SK_Transpose, MVT::v2f64, 1},
3678 // Select shuffle kinds.
3679 // TODO: handle vXi8/vXi16.
3680 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3681 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3682 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3683 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3684 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3685 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3686 // PermuteSingleSrc shuffle kinds.
3687 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
3688 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3689 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
3690 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
3691 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3692 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
3693 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3694 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3695 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
3696 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3697 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3698 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3699 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3700 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3701 // Reverse can be lowered with `rev`.
3702 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3703 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3704 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3705 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3706 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3707 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3708 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3709 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3710 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3711 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3712 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3713 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
3714 // Splice can all be lowered as `ext`.
3715 {TTI::SK_Splice, MVT::v2i32, 1},
3716 {TTI::SK_Splice, MVT::v4i32, 1},
3717 {TTI::SK_Splice, MVT::v2i64, 1},
3718 {TTI::SK_Splice, MVT::v2f32, 1},
3719 {TTI::SK_Splice, MVT::v4f32, 1},
3720 {TTI::SK_Splice, MVT::v2f64, 1},
3721 {TTI::SK_Splice, MVT::v8f16, 1},
3722 {TTI::SK_Splice, MVT::v8bf16, 1},
3723 {TTI::SK_Splice, MVT::v8i16, 1},
3724 {TTI::SK_Splice, MVT::v16i8, 1},
3725 {TTI::SK_Splice, MVT::v4bf16, 1},
3726 {TTI::SK_Splice, MVT::v4f16, 1},
3727 {TTI::SK_Splice, MVT::v4i16, 1},
3728 {TTI::SK_Splice, MVT::v8i8, 1},
3729 // Broadcast shuffle kinds for scalable vectors
3730 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
3731 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
3732 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
3733 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
3734 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
3735 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
3736 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
3737 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
3738 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
3739 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
3740 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
3741 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
3742 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
3743 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
3744 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
3745 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
3746 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
3747 // Handle the cases for vector.reverse with scalable vectors
3748 {TTI::SK_Reverse, MVT::nxv16i8, 1},
3749 {TTI::SK_Reverse, MVT::nxv8i16, 1},
3750 {TTI::SK_Reverse, MVT::nxv4i32, 1},
3751 {TTI::SK_Reverse, MVT::nxv2i64, 1},
3752 {TTI::SK_Reverse, MVT::nxv2f16, 1},
3753 {TTI::SK_Reverse, MVT::nxv4f16, 1},
3754 {TTI::SK_Reverse, MVT::nxv8f16, 1},
3755 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
3756 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
3757 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
3758 {TTI::SK_Reverse, MVT::nxv2f32, 1},
3759 {TTI::SK_Reverse, MVT::nxv4f32, 1},
3760 {TTI::SK_Reverse, MVT::nxv2f64, 1},
3761 {TTI::SK_Reverse, MVT::nxv16i1, 1},
3762 {TTI::SK_Reverse, MVT::nxv8i1, 1},
3763 {TTI::SK_Reverse, MVT::nxv4i1, 1},
3764 {TTI::SK_Reverse, MVT::nxv2i1, 1},
3765 };
3766 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3767 return LT.first * Entry->Cost;
3768 }
3769
3770 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3771 return getSpliceCost(Tp, Index);
3772
3773 // Inserting a subvector can often be done with either a D, S or H register
3774 // move, so long as the inserted vector is "aligned".
3775 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
3776 LT.second.getSizeInBits() <= 128 && SubTp) {
3777 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
3778 if (SubLT.second.isVector()) {
3779 int NumElts = LT.second.getVectorNumElements();
3780 int NumSubElts = SubLT.second.getVectorNumElements();
3781 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
3782 return SubLT.first;
3783 }
3784 }
3785
3786 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3787}
3788
3789 static bool containsDecreasingPointers(Loop *TheLoop,
3790 PredicatedScalarEvolution *PSE) {
3791 const auto &Strides = DenseMap<Value *, const SCEV *>();
3792 for (BasicBlock *BB : TheLoop->blocks()) {
3793 // Scan the instructions in the block and look for addresses that are
3794 // consecutive and decreasing.
3795 for (Instruction &I : *BB) {
3796 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
3797 Value *Ptr = getLoadStorePointerOperand(&I);
3798 Type *AccessTy = getLoadStoreType(&I);
3799 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, Strides, /*Assume=*/true,
3800 /*ShouldCheckWrap=*/false)
3801 .value_or(0) < 0)
3802 return true;
3803 }
3804 }
3805 }
3806 return false;
3807}
3808
3809 bool AArch64TTIImpl::preferPredicateOverEpilogue(TailFoldingInfo *TFI) {
3810 if (!ST->hasSVE())
3811 return false;
3812
3813 // We don't currently support vectorisation with interleaving for SVE - with
3814 // such loops we're better off not using tail-folding. This gives us a chance
3815 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
3816 if (TFI->IAI->hasGroups())
3817 return false;
3818
3819 TailFoldingOpts Required = TailFoldingOpts::Disabled;
3820 if (TFI->LVL->getReductionVars().size())
3821 Required |= TailFoldingOpts::Reductions;
3822 if (TFI->LVL->getFixedOrderRecurrences().size())
3823 Required |= TailFoldingOpts::Recurrences;
3824
3825 // We call this to discover whether any load/store pointers in the loop have
3826 // negative strides. This will require extra work to reverse the loop
3827 // predicate, which may be expensive.
3828 if (containsDecreasingPointers(TFI->LVL->getLoop(),
3829 TFI->LVL->getPredicatedScalarEvolution()))
3830 Required |= TailFoldingOpts::Reverse;
3831 if (Required == TailFoldingOpts::Disabled)
3832 Required |= TailFoldingOpts::Simple;
3833
3834 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
3835 Required))
3836 return false;
3837
3838 // Don't tail-fold for tight loops where we would be better off interleaving
3839 // with an unpredicated loop.
3840 unsigned NumInsns = 0;
3841 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
3842 NumInsns += BB->sizeWithoutDebug();
3843 }
3844
3845 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
3846 return NumInsns >= SVETailFoldInsnThreshold;
3847}
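// For example, with the default sve-tail-folding-insn-threshold of 15, a
// 12-instruction loop body reports false (an unpredicated loop with an
// epilogue is preferred), while a 20-instruction body reports true, assuming
// the tail-folding options the loop requires are enabled for this target.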
3848
3849 InstructionCost
3850 AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
3851 int64_t BaseOffset, bool HasBaseReg,
3852 int64_t Scale, unsigned AddrSpace) const {
3853 // Scaling factors are not free at all.
3854 // Operands | Rt Latency
3855 // -------------------------------------------
3856 // Rt, [Xn, Xm] | 4
3857 // -------------------------------------------
3858 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
3859 // Rt, [Xn, Wm, <extend> #imm] |
3860 TargetLoweringBase::AddrMode AM;
3861 AM.BaseGV = BaseGV;
3862 AM.BaseOffs = BaseOffset;
3863 AM.HasBaseReg = HasBaseReg;
3864 AM.Scale = Scale;
3865 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
3866 // Scale represents reg2 * scale, thus account for 1 if
3867 // it is not equal to 0 or 1.
3868 return AM.Scale != 0 && AM.Scale != 1;
3869 return -1;
3870}
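// For example, an i32 access using [Xn, Xm, lsl #2] has Scale == 4; if that
// addressing mode is legal the function returns 1, reflecting the extra
// index-register latency shown in the table above, while [Xn] or [Xn, Xm]
// (Scale 0 or 1) returns 0 and an illegal combination returns -1.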
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II, Intrinsic::ID IID)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE)
static std::optional< Instruction * > instCombineSVEAllActive(IntrinsicInst &II, Intrinsic::ID IID)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
amdgpu AMDGPU Register Bank Select
assume Assume Builder
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
return RetTy
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
static LVOptions Options
Definition: LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
IntegerType * Int32Ty
#define P(N)
const char LLVMTargetMachineRef TM
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition: TapiFile.cpp:40
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
unsigned getMinSVEVectorSizeInBits() const
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
Definition: APInt.h:76
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:427
unsigned popcount() const
Count the number of bits set.
Definition: APInt.h:1614
unsigned countLeadingOnes() const
Definition: APInt.h:1568
void negate()
Negate this APInt in place.
Definition: APInt.h:1415
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1010
unsigned logBase2() const
Definition: APInt.h:1696
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:805
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:418
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1507
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:547
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:856
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:934
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:978
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:727
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:334
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:619
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:820
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", Instruction *InsertBefore=nullptr)
Definition: InstrTypes.h:248
bool isStrictFP() const
Determine if the call requires strict floating point semantics.
Definition: InstrTypes.h:1882
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1357
unsigned arg_size() const
Definition: InstrTypes.h:1355
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1451
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:711
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:714
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:717
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:715
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:716
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:718
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:727
bool isIntPredicate() const
Definition: InstrTypes.h:819
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1579
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:888
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:136
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:356
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:294
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:291
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
bool allowContract() const
Definition: FMF.h:70
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:693
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Definition: IRBuilder.cpp:88
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2434
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition: IRBuilder.h:1036
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition: IRBuilder.h:2422
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition: IRBuilder.h:525
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition: IRBuilder.h:545
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Definition: IRBuilder.cpp:1223
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
Definition: IRBuilder.cpp:941
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Definition: IRBuilder.cpp:589
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Definition: IRBuilder.cpp:1119
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition: IRBuilder.h:512
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition: IRBuilder.h:530
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition: IRBuilder.h:297
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition: IRBuilder.h:517
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition: IRBuilder.h:477
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2167
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition: IRBuilder.h:2359
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition: IRBuilder.h:2089
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition: IRBuilder.h:1786
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition: IRBuilder.h:2456
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition: IRBuilder.h:1799
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Definition: IRBuilder.cpp:609
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition: IRBuilder.h:540
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition: IRBuilder.h:1657
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition: IRBuilder.h:2158
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition: IRBuilder.h:180
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
Definition: IRBuilder.h:1862
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2628
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:46
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:424
BuilderTy & Builder
Definition: InstCombiner.h:59
static InstructionCost getInvalid(CostType Val=0)
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:71
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
Definition: Instruction.h:195
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:841
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
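A small sketch (II assumed to be an IntrinsicInst&) of the usual dispatch on getIntrinsicID():
  switch (II.getIntrinsicID()) {
  case Intrinsic::aarch64_sve_ptrue: // one of the AArch64 SVE intrinsics
    // a target-specific fold would go here
    break;
  default:
    break;
  }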
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:60
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:254
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
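Sketch of CreatePHI plus addIncoming; Builder, StartVal, NextVal, PreheaderBB and LatchBB are assumed.
  PHINode *IV = Builder.CreatePHI(Builder.getInt64Ty(), /*NumReservedValues=*/2, "iv");
  IV->addIncoming(StartVal, PreheaderBB); // value on entry from the preheader
  IV->addIncoming(NextVal, LatchBB);      // value carried around the backedge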
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
Definition: DerivedTypes.h:659
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1743
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of existing predicates.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:72
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
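A hedged sketch of how the reduction accessors above are commonly walked; LVL is assumed to be a legality object exposing the getReductionVars() accessor listed earlier in this section.
  for (auto &Entry : LVL->getReductionVars()) {
    const RecurrenceDescriptor &RdxDesc = Entry.second;
    Type *RdxTy = RdxDesc.getRecurrenceType();    // e.g. float for an fadd reduction
    RecurKind Kind = RdxDesc.getRecurrenceKind(); // e.g. RecurKind::FAdd
    (void)RdxTy; (void)Kind;
  }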
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
std::optional< bool > requiresSMChange(const SMEAttrs &Callee, bool BodyOverridesInterface=false) const
bool requiresLazySave(const SMEAttrs &Callee) const
bool hasNewZABody() const
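A sketch, assuming CallerF and CalleeF are llvm::Function references, of how the SMEAttrs queries above can feed a call-cost decision:
  SMEAttrs CallerAttrs(CallerF), CalleeAttrs(CalleeF);
  bool Expensive = CallerAttrs.requiresLazySave(CalleeAttrs) ||
                   CallerAttrs.requiresSMChange(CalleeAttrs).has_value() ||
                   CalleeAttrs.hasNewZABody();
  // A true result typically implies extra state-saving or mode-switch work around the call.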
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:714
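For example (Ctx assumed to be an LLVMContext), the SVE container type <vscale x 4 x i32> can be built as:
  auto *SVTy = ScalableVectorType::get(Type::getInt32Ty(Ctx), /*MinNumElts=*/4);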
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
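A hedged sketch of the usual affine-stride check built from the SCEV pieces above; SE and Ptr are assumed, and this is not this file's exact code.
  const SCEV *PtrSCEV = SE->getSCEV(Ptr);
  if (auto *AR = dyn_cast<SCEVAddRecExpr>(PtrSCEV))
    if (AR->isAffine()) {
      // Ptr evolves as Start + Step * IV with loop-invariant Start and Step.
    }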
size_type size() const
Definition: SmallPtrSet.h:93
bool empty() const
Definition: SmallVector.h:94
size_t size() const
Definition: SmallVector.h:91
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
void resize(size_type N)
Definition: SmallVector.h:642
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
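Typical SmallVector usage with the members listed above (Ctx assumed to be an LLVMContext):
  SmallVector<Type *, 4> ArgTys;                        // four elements of inline storage
  ArgTys.push_back(Type::getInt64Ty(Ctx));
  ArgTys.push_back(Type::getInt32Ty(Ctx));
  ArgTys.insert(ArgTys.begin(), Type::getInt1Ty(Ctx));  // prepend the predicate type
  ArgTys.resize(2);                                     // drop everything past the first two
  bool TwoLeft = !ArgTys.empty() && ArgTys.size() == 2; // true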
An instruction for storing to memory.
Definition: Instructions.h:301
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:704
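For example, split() peels the first component off an option-style string:
  StringRef Opt("all+noreverse");
  auto [Head, Tail] = Opt.split('+'); // Head == "all", Tail == "noreverse"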
Class to represent struct types.
Definition: DerivedTypes.h:213
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering.
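A hedged sketch of the common legality pattern these TargetLoweringBase queries support in cost code; TLI, DL, Ty and I are assumed.
  int ISD = TLI->InstructionOpcodeToISD(I->getOpcode());
  EVT VT = TLI->getValueType(DL, Ty);
  if (TLI->isTypeLegal(VT) && TLI->isOperationLegalOrCustom(ISD, VT)) {
    // the IR operation maps onto a natively supported (or custom-lowered) node
  }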
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
static OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ None
The cast is not used with a load/store of any kind.
@ Normal
The cast is used with a normal load/store.
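A sketch of how the cost-kind, shuffle-kind and cast-context enums above appear in TTI queries; TTI, VTy, DstTy and SrcTy are assumed.
  InstructionCost ShufCost =
      TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VTy,
                         /*Mask=*/{}, TargetTransformInfo::TCK_RecipThroughput);
  InstructionCost CastCost =
      TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
                           TargetTransformInfo::CastContextHint::None,
                           TargetTransformInfo::TCK_RecipThroughput);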
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:325
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:265
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition: Type.h:143
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition: Type.h:185
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:262
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:228
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:216
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:348
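Sketch of the Type queries above as they are typically combined in cost code (Ty assumed):
  if (Ty->isVectorTy() && Ty->getScalarType()->isIntegerTy() &&
      Ty->getScalarSizeInBits() <= 32 && !Ty->isScalableTy()) {
    // a fixed-width integer vector whose elements are at most 32 bits wide
  }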
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:921
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1069
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:384
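A small sketch of the Value accessors above (I assumed to be an Instruction*):
  Value *Op = I->getOperand(0);
  if (Op->hasOneUse() && Op->getType()->isPointerTy()) {
    Align A = Op->getPointerAlignment(I->getModule()->getDataLayout());
    (void)A; // known alignment of the single-use pointer operand
  }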
Base class of all SIMD vector types.
Definition: DerivedTypes.h:400
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
Definition: DerivedTypes.h:638
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Definition: Type.cpp:677
Type * getElementType() const
Definition: DerivedTypes.h:433
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
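Sketch combining VectorType::get, ElementCount and the TypeSize helpers above (Ctx assumed to be an LLVMContext):
  ElementCount EC = ElementCount::getScalable(2);               // vscale x 2 lanes
  VectorType *VTy = VectorType::get(Type::getInt64Ty(Ctx), EC); // <vscale x 2 x i64>
  TypeSize FixedBits = TypeSize::getFixed(128);                 // exactly 128 bits
  TypeSize ScalBits = TypeSize::getScalable(128);               // 128 bits times vscale
  bool Scalable = ScalBits.isScalable();                        // true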
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the given register size.
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to synthesize the immediate.
static constexpr unsigned SVEBitsPerBlock
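A hedged sketch of the AArch64 immediate helpers above; the example constant is arbitrary.
  // Requires AArch64AddressingModes.h and AArch64ExpandImm.h.
  uint64_t Imm = 0xdead00000000beefULL;           // arbitrary example constant
  if (!AArch64_AM::isLogicalImmediate(Imm, 64)) { // not encodable in AND/ORR/EOR
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(Imm, 64, Insn);     // MOVZ/MOVK/MOVN steps
    unsigned NumInsns = Insn.size();              // a reasonable materialization cost
    (void)NumInsns;
  }
  // AArch64::SVEBitsPerBlock is the 128-bit SVE granule used when sizing scalable types.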
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition: ISDOpcodes.h:750
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
Definition: ISDOpcodes.h:787
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and reloaded with the other.
Definition: ISDOpcodes.h:900
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:774
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:925
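The ISD opcodes above are what IR opcodes legalize to; for instance (TLI assumed):
  // Instruction opcodes map onto these ISD nodes via InstructionOpcodeToISD.
  assert(TLI->InstructionOpcodeToISD(Instruction::Add) == ISD::ADD);
  assert(TLI->InstructionOpcodeToISD(Instruction::FAdd) == ISD::FADD);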