AArch64TargetTransformInfo.cpp
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
20#include "llvm/IR/Intrinsics.h"
21#include "llvm/IR/IntrinsicsAArch64.h"
23#include "llvm/Support/Debug.h"
26#include <algorithm>
27#include <optional>
28using namespace llvm;
29using namespace llvm::PatternMatch;
30
31#define DEBUG_TYPE "aarch64tti"
32
33static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
34 cl::init(true), cl::Hidden);
35
36static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
37 cl::Hidden);
38
39static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
40 cl::init(10), cl::Hidden);
41
42namespace {
43class TailFoldingKind {
44private:
45 uint8_t Bits = 0; // Currently defaults to disabled.
46
47public:
48 enum TailFoldingOpts {
49 TFDisabled = 0x0,
50 TFReductions = 0x01,
51 TFRecurrences = 0x02,
52 TFSimple = 0x80,
53 TFAll = TFReductions | TFRecurrences | TFSimple
54 };
55
56 void operator=(const std::string &Val) {
57 if (Val.empty())
58 return;
59 SmallVector<StringRef, 6> TailFoldTypes;
60 StringRef(Val).split(TailFoldTypes, '+', -1, false);
61 for (auto TailFoldType : TailFoldTypes) {
62 if (TailFoldType == "disabled")
63 Bits = 0;
64 else if (TailFoldType == "all")
65 Bits = TFAll;
66 else if (TailFoldType == "default")
67 Bits = 0; // Currently defaults to never tail-folding.
68 else if (TailFoldType == "simple")
69 add(TFSimple);
70 else if (TailFoldType == "reductions")
71 add(TFReductions);
72 else if (TailFoldType == "recurrences")
73 add(TFRecurrences);
74 else if (TailFoldType == "noreductions")
75 remove(TFReductions);
76 else if (TailFoldType == "norecurrences")
77 remove(TFRecurrences);
78 else {
79 errs()
80 << "invalid argument " << TailFoldType.str()
81 << " to -sve-tail-folding=; each element must be one of: disabled, "
82 "all, default, simple, reductions, noreductions, recurrences, "
83 "norecurrences\n";
84 }
85 }
86 }
87
88 operator uint8_t() const { return Bits; }
89
90 void add(uint8_t Flag) { Bits |= Flag; }
91 void remove(uint8_t Flag) { Bits &= ~Flag; }
92};
93} // namespace
94
95TailFoldingKind TailFoldingKindLoc;
96
98 "sve-tail-folding",
100 "Control the use of vectorisation using tail-folding for SVE:"
101 "\ndisabled No loop types will vectorize using tail-folding"
102 "\ndefault Uses the default tail-folding settings for the target "
103 "CPU"
104 "\nall All legal loop types will vectorize using tail-folding"
105 "\nsimple Use tail-folding for simple loops (not reductions or "
106 "recurrences)"
107 "\nreductions Use tail-folding for loops containing reductions"
108 "\nrecurrences Use tail-folding for loops containing fixed order "
109 "recurrences"),
110 cl::location(TailFoldingKindLoc));
111
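// Example (illustrative, not part of the original source): passing
// -sve-tail-folding=all+noreductions first sets TFAll and then clears
// TFReductions, leaving simple loops and fixed-order recurrences eligible
// for tail-folding.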
112// Experimental option that will only be fully functional when the
113// code-generator is changed to use SVE instead of NEON for all fixed-width
114// operations.
116 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
117
118// Experimental option that will only be fully functional when the cost-model
119// and code-generator have been changed to avoid using scalable vector
120// instructions that are not legal in streaming SVE mode.
122 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
123
124bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
125 const Function *Callee) const {
126 SMEAttrs CallerAttrs(*Caller);
127 SMEAttrs CalleeAttrs(*Callee);
128 if (CallerAttrs.requiresSMChange(CalleeAttrs,
129 /*BodyOverridesInterface=*/true) ||
130 CallerAttrs.requiresLazySave(CalleeAttrs) ||
131 CalleeAttrs.hasNewZAInterface())
132 return false;
133
134 const TargetMachine &TM = getTLI()->getTargetMachine();
135
136 const FeatureBitset &CallerBits =
137 TM.getSubtargetImpl(*Caller)->getFeatureBits();
138 const FeatureBitset &CalleeBits =
139 TM.getSubtargetImpl(*Callee)->getFeatureBits();
140
141 // Inline a callee if its target-features are a subset of the caller's
142 // target-features.
143 return (CallerBits & CalleeBits) == CalleeBits;
144}
145
150}
151
152/// Calculate the cost of materializing a 64-bit value. This helper
153/// method might only calculate a fraction of a larger immediate. Therefore it
154/// is valid to return a cost of ZERO.
155InstructionCost AArch64TTIImpl::getIntImmCost(int64_t Val) {
156 // Check if the immediate can be encoded within an instruction.
157 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
158 return 0;
159
160 if (Val < 0)
161 Val = ~Val;
162
163 // Calculate how many moves we will need to materialize this constant.
164 SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
165 AArch64_IMM::expandMOVImm(Val, 64, Insn);
166 return Insn.size();
167}
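// Illustrative costs under the scheme above (assumed, not taken from the
// original source):
//   getIntImmCost(0)                  -> 0 (always free)
//   getIntImmCost(0xFF)               -> 0 (valid 64-bit logical immediate)
//   getIntImmCost(0x0000123400005678) -> 2 (MOVZ + MOVK)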
168
169/// Calculate the cost of materializing the given constant.
170InstructionCost AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
171 TTI::TargetCostKind CostKind) {
172 assert(Ty->isIntegerTy());
173
174 unsigned BitSize = Ty->getPrimitiveSizeInBits();
175 if (BitSize == 0)
176 return ~0U;
177
178 // Sign-extend all constants to a multiple of 64-bit.
179 APInt ImmVal = Imm;
180 if (BitSize & 0x3f)
181 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
182
183 // Split the constant into 64-bit chunks and calculate the cost for each
184 // chunk.
185 InstructionCost Cost = 0;
186 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
187 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
188 int64_t Val = Tmp.getSExtValue();
189 Cost += getIntImmCost(Val);
190 }
191 // We need at least one instruction to materialize the constant.
192 return std::max<InstructionCost>(1, Cost);
193}
194
195InstructionCost AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
196 const APInt &Imm, Type *Ty,
197 TTI::TargetCostKind CostKind,
198 Instruction *Inst) {
199 assert(Ty->isIntegerTy());
200
201 unsigned BitSize = Ty->getPrimitiveSizeInBits();
202 // There is no cost model for constants with a bit size of 0. Return TCC_Free
203 // here, so that constant hoisting will ignore this constant.
204 if (BitSize == 0)
205 return TTI::TCC_Free;
206
207 unsigned ImmIdx = ~0U;
208 switch (Opcode) {
209 default:
210 return TTI::TCC_Free;
211 case Instruction::GetElementPtr:
212 // Always hoist the base address of a GetElementPtr.
213 if (Idx == 0)
214 return 2 * TTI::TCC_Basic;
215 return TTI::TCC_Free;
216 case Instruction::Store:
217 ImmIdx = 0;
218 break;
219 case Instruction::Add:
220 case Instruction::Sub:
221 case Instruction::Mul:
222 case Instruction::UDiv:
223 case Instruction::SDiv:
224 case Instruction::URem:
225 case Instruction::SRem:
226 case Instruction::And:
227 case Instruction::Or:
228 case Instruction::Xor:
229 case Instruction::ICmp:
230 ImmIdx = 1;
231 break;
232 // Always return TCC_Free for the shift value of a shift instruction.
233 case Instruction::Shl:
234 case Instruction::LShr:
235 case Instruction::AShr:
236 if (Idx == 1)
237 return TTI::TCC_Free;
238 break;
239 case Instruction::Trunc:
240 case Instruction::ZExt:
241 case Instruction::SExt:
242 case Instruction::IntToPtr:
243 case Instruction::PtrToInt:
244 case Instruction::BitCast:
245 case Instruction::PHI:
246 case Instruction::Call:
247 case Instruction::Select:
248 case Instruction::Ret:
249 case Instruction::Load:
250 break;
251 }
252
253 if (Idx == ImmIdx) {
254 int NumConstants = (BitSize + 63) / 64;
255 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
256 return (Cost <= NumConstants * TTI::TCC_Basic)
257 ? static_cast<int>(TTI::TCC_Free)
258 : Cost;
259 }
260 return TTI::TCC_Free;
261}
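// Illustrative behaviour (assumed, not from the original source): an
// "add i64 %x, 42" keeps its immediate (one MOVZ fits within TCC_Basic, so
// TCC_Free is returned), whereas adding the constant 0x123456789ABCDEF0
// needs four MOVZ/MOVK moves, so the real cost is returned and the constant
// becomes a candidate for hoisting.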
262
263InstructionCost
264AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
265 const APInt &Imm, Type *Ty,
266 TTI::TargetCostKind CostKind) {
267 assert(Ty->isIntegerTy());
268
269 unsigned BitSize = Ty->getPrimitiveSizeInBits();
270 // There is no cost model for constants with a bit size of 0. Return TCC_Free
271 // here, so that constant hoisting will ignore this constant.
272 if (BitSize == 0)
273 return TTI::TCC_Free;
274
275 // Most (all?) AArch64 intrinsics do not support folding immediates into the
276 // selected instruction, so we compute the materialization cost for the
277 // immediate directly.
278 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
279 return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
280
281 switch (IID) {
282 default:
283 return TTI::TCC_Free;
284 case Intrinsic::sadd_with_overflow:
285 case Intrinsic::uadd_with_overflow:
286 case Intrinsic::ssub_with_overflow:
287 case Intrinsic::usub_with_overflow:
288 case Intrinsic::smul_with_overflow:
289 case Intrinsic::umul_with_overflow:
290 if (Idx == 1) {
291 int NumConstants = (BitSize + 63) / 64;
292 InstructionCost Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
293 return (Cost <= NumConstants * TTI::TCC_Basic)
294 ? static_cast<int>(TTI::TCC_Free)
295 : Cost;
296 }
297 break;
298 case Intrinsic::experimental_stackmap:
299 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
300 return TTI::TCC_Free;
301 break;
302 case Intrinsic::experimental_patchpoint_void:
303 case Intrinsic::experimental_patchpoint_i64:
304 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
305 return TTI::TCC_Free;
306 break;
307 case Intrinsic::experimental_gc_statepoint:
308 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
309 return TTI::TCC_Free;
310 break;
311 }
312 return TTI::TCC_Free;
313}
314
317 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
318 if (TyWidth == 32 || TyWidth == 64)
319 return TTI::PSK_FastHardware;
320 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
321 return TTI::PSK_Software;
322}
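// For instance (illustrative): getPopcntSupport(32) and getPopcntSupport(64)
// report PSK_FastHardware, while a 128-bit popcount currently falls back to
// PSK_Software despite the TODO above.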
323
324InstructionCost
325AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
326 TTI::TargetCostKind CostKind) {
327 auto *RetTy = ICA.getReturnType();
328 switch (ICA.getID()) {
329 case Intrinsic::umin:
330 case Intrinsic::umax:
331 case Intrinsic::smin:
332 case Intrinsic::smax: {
333 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
336 // v2i64 types get converted to cmp+bif hence the cost of 2
337 if (LT.second == MVT::v2i64)
338 return LT.first * 2;
339 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))
340 return LT.first;
341 break;
342 }
343 case Intrinsic::sadd_sat:
344 case Intrinsic::ssub_sat:
345 case Intrinsic::uadd_sat:
346 case Intrinsic::usub_sat: {
347 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
349 MVT::v2i64};
351 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
352 // need to extend the type, as it uses shr(qadd(shl, shl)).
353 unsigned Instrs =
354 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
355 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
356 return LT.first * Instrs;
357 break;
358 }
359 case Intrinsic::abs: {
360 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
362 MVT::v2i64};
364 if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))
365 return LT.first;
366 break;
367 }
368 case Intrinsic::experimental_stepvector: {
369 InstructionCost Cost = 1; // Cost of the `index' instruction
371 // Legalisation of illegal vectors involves an `index' instruction plus
372 // (LT.first - 1) vector adds.
373 if (LT.first > 1) {
374 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
375 InstructionCost AddCost =
376 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
377 Cost += AddCost * (LT.first - 1);
378 }
379 return Cost;
380 }
381 case Intrinsic::bitreverse: {
382 static const CostTblEntry BitreverseTbl[] = {
383 {Intrinsic::bitreverse, MVT::i32, 1},
384 {Intrinsic::bitreverse, MVT::i64, 1},
385 {Intrinsic::bitreverse, MVT::v8i8, 1},
386 {Intrinsic::bitreverse, MVT::v16i8, 1},
387 {Intrinsic::bitreverse, MVT::v4i16, 2},
388 {Intrinsic::bitreverse, MVT::v8i16, 2},
389 {Intrinsic::bitreverse, MVT::v2i32, 2},
390 {Intrinsic::bitreverse, MVT::v4i32, 2},
391 {Intrinsic::bitreverse, MVT::v1i64, 2},
392 {Intrinsic::bitreverse, MVT::v2i64, 2},
393 };
394 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
395 const auto *Entry =
396 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
397 if (Entry) {
398 // The cost model uses the legal type (i32) that i8 and i16 will be
399 // promoted to, plus 1, so that we match the actual lowering cost.
400 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
401 TLI->getValueType(DL, RetTy, true) == MVT::i16)
402 return LegalisationCost.first * Entry->Cost + 1;
403
404 return LegalisationCost.first * Entry->Cost;
405 }
406 break;
407 }
408 case Intrinsic::ctpop: {
409 if (!ST->hasNEON()) {
410 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
411 return getTypeLegalizationCost(RetTy).first * 12;
412 }
413 static const CostTblEntry CtpopCostTbl[] = {
418 {ISD::CTPOP, MVT::i64, 4},
421 {ISD::CTPOP, MVT::v8i8, 1},
422 {ISD::CTPOP, MVT::i32, 5},
423 };
425 MVT MTy = LT.second;
426 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
427 // Extra cost of +1 when illegal vector types are legalized by promoting
428 // the integer type.
429 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
430 RetTy->getScalarSizeInBits()
431 ? 1
432 : 0;
433 return LT.first * Entry->Cost + ExtraCost;
434 }
435 break;
436 }
437 case Intrinsic::sadd_with_overflow:
438 case Intrinsic::uadd_with_overflow:
439 case Intrinsic::ssub_with_overflow:
440 case Intrinsic::usub_with_overflow:
441 case Intrinsic::smul_with_overflow:
442 case Intrinsic::umul_with_overflow: {
443 static const CostTblEntry WithOverflowCostTbl[] = {
444 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
445 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
446 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
447 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
448 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
449 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
450 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
451 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
452 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
453 {Intrinsic::usub_with_overflow, MVT::i8, 3},
454 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
455 {Intrinsic::usub_with_overflow, MVT::i16, 3},
456 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
457 {Intrinsic::usub_with_overflow, MVT::i32, 1},
458 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
459 {Intrinsic::usub_with_overflow, MVT::i64, 1},
460 {Intrinsic::smul_with_overflow, MVT::i8, 5},
461 {Intrinsic::umul_with_overflow, MVT::i8, 4},
462 {Intrinsic::smul_with_overflow, MVT::i16, 5},
463 {Intrinsic::umul_with_overflow, MVT::i16, 4},
464 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
465 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
466 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
467 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
468 };
469 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
470 if (MTy.isSimple())
471 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
472 MTy.getSimpleVT()))
473 return Entry->Cost;
474 break;
475 }
476 case Intrinsic::fptosi_sat:
477 case Intrinsic::fptoui_sat: {
478 if (ICA.getArgTypes().empty())
479 break;
480 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
481 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
482 EVT MTy = TLI->getValueType(DL, RetTy);
483 // Check for the legal types, which are where the size of the input and the
484 // output are the same, or we are using cvt f64->i32 or f32->i64.
485 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
486 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
487 LT.second == MVT::v2f64) &&
488 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
489 (LT.second == MVT::f64 && MTy == MVT::i32) ||
490 (LT.second == MVT::f32 && MTy == MVT::i64)))
491 return LT.first;
492 // Similarly for fp16 sizes
493 if (ST->hasFullFP16() &&
494 ((LT.second == MVT::f16 && MTy == MVT::i32) ||
495 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
496 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits()))))
497 return LT.first;
498
499 // Otherwise we use a legal convert followed by a min+max
500 if ((LT.second.getScalarType() == MVT::f32 ||
501 LT.second.getScalarType() == MVT::f64 ||
502 (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&
503 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
504 Type *LegalTy =
505 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
506 if (LT.second.isVector())
507 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
509 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin : Intrinsic::umin,
510 LegalTy, {LegalTy, LegalTy});
512 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax : Intrinsic::umax,
513 LegalTy, {LegalTy, LegalTy});
515 return LT.first * Cost;
516 }
517 break;
518 }
519 default:
520 break;
521 }
522 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
523}
524
525/// Remove redundant reinterpret casts (to/from svbool) in the presence of
526/// control flow.
527static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
528 IntrinsicInst &II) {
529 SmallVector<Instruction *, 32> Worklist;
530 auto RequiredType = II.getType();
531
532 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
533 assert(PN && "Expected Phi Node!");
534
535 // Don't create a new Phi unless we can remove the old one.
536 if (!PN->hasOneUse())
537 return std::nullopt;
538
539 for (Value *IncValPhi : PN->incoming_values()) {
540 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
541 if (!Reinterpret ||
542 Reinterpret->getIntrinsicID() !=
543 Intrinsic::aarch64_sve_convert_to_svbool ||
544 RequiredType != Reinterpret->getArgOperand(0)->getType())
545 return std::nullopt;
546 }
547
548 // Create the new Phi
549 LLVMContext &Ctx = PN->getContext();
550 IRBuilder<> Builder(Ctx);
551 Builder.SetInsertPoint(PN);
552 PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
553 Worklist.push_back(PN);
554
555 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
556 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
557 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
558 Worklist.push_back(Reinterpret);
559 }
560
561 // Cleanup Phi Node and reinterprets
562 return IC.replaceInstUsesWith(II, NPN);
563}
564
565// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _))))
566// => (binop (pred) (from_svbool _) (from_svbool _))
567//
568// The above transformation eliminates a `to_svbool` in the predicate
569// operand of bitwise operation `binop` by narrowing the vector width of
570// the operation. For example, it would convert a `<vscale x 16 x i1>
571// and` into a `<vscale x 4 x i1> and`. This is profitable because
572// to_svbool must zero the new lanes during widening, whereas
573// from_svbool is free.
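// Illustrative IR (assumed operand types, not from the original source):
//   %pg  = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %p)
//   %and = call <vscale x 16 x i1> @llvm.aarch64.sve.and.z.nxv16i1(<vscale x 16 x i1> %pg, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b)
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %and)
// becomes
//   %a4  = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
//   %b4  = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %b)
//   %res = call <vscale x 4 x i1> @llvm.aarch64.sve.and.z.nxv4i1(<vscale x 4 x i1> %p, <vscale x 4 x i1> %a4, <vscale x 4 x i1> %b4)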
574static std::optional<Instruction *>
576 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
577 if (!BinOp)
578 return std::nullopt;
579
580 auto IntrinsicID = BinOp->getIntrinsicID();
581 switch (IntrinsicID) {
582 case Intrinsic::aarch64_sve_and_z:
583 case Intrinsic::aarch64_sve_bic_z:
584 case Intrinsic::aarch64_sve_eor_z:
585 case Intrinsic::aarch64_sve_nand_z:
586 case Intrinsic::aarch64_sve_nor_z:
587 case Intrinsic::aarch64_sve_orn_z:
588 case Intrinsic::aarch64_sve_orr_z:
589 break;
590 default:
591 return std::nullopt;
592 }
593
594 auto BinOpPred = BinOp->getOperand(0);
595 auto BinOpOp1 = BinOp->getOperand(1);
596 auto BinOpOp2 = BinOp->getOperand(2);
597
598 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
599 if (!PredIntr ||
600 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
601 return std::nullopt;
602
603 auto PredOp = PredIntr->getOperand(0);
604 auto PredOpTy = cast<VectorType>(PredOp->getType());
605 if (PredOpTy != II.getType())
606 return std::nullopt;
607
609 Builder.SetInsertPoint(&II);
610
611 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
612 auto NarrowBinOpOp1 = Builder.CreateIntrinsic(
613 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
614 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
615 if (BinOpOp1 == BinOpOp2)
616 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
617 else
618 NarrowedBinOpArgs.push_back(Builder.CreateIntrinsic(
619 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
620
621 auto NarrowedBinOp =
622 Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
623 return IC.replaceInstUsesWith(II, NarrowedBinOp);
624}
625
626static std::optional<Instruction *>
628 // If the reinterpret instruction operand is a PHI Node
629 if (isa<PHINode>(II.getArgOperand(0)))
630 return processPhiNode(IC, II);
631
632 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
633 return BinOpCombine;
634
635 SmallVector<Instruction *, 32> CandidatesForRemoval;
636 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
637
638 const auto *IVTy = cast<VectorType>(II.getType());
639
640 // Walk the chain of conversions.
641 while (Cursor) {
642 // If the type of the cursor has fewer lanes than the final result, zeroing
643 // must take place, which breaks the equivalence chain.
644 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
645 if (CursorVTy->getElementCount().getKnownMinValue() <
646 IVTy->getElementCount().getKnownMinValue())
647 break;
648
649 // If the cursor has the same type as I, it is a viable replacement.
650 if (Cursor->getType() == IVTy)
651 EarliestReplacement = Cursor;
652
653 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
654
655 // If this is not an SVE conversion intrinsic, this is the end of the chain.
656 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
657 Intrinsic::aarch64_sve_convert_to_svbool ||
658 IntrinsicCursor->getIntrinsicID() ==
659 Intrinsic::aarch64_sve_convert_from_svbool))
660 break;
661
662 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
663 Cursor = IntrinsicCursor->getOperand(0);
664 }
665
666 // If no viable replacement in the conversion chain was found, there is
667 // nothing to do.
668 if (!EarliestReplacement)
669 return std::nullopt;
670
671 return IC.replaceInstUsesWith(II, EarliestReplacement);
672}
673
674static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
675 IntrinsicInst &II) {
676 IRBuilder<> Builder(&II);
677 auto Select = Builder.CreateSelect(II.getOperand(0), II.getOperand(1),
678 II.getOperand(2));
679 return IC.replaceInstUsesWith(II, Select);
680}
681
682static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
683 IntrinsicInst &II) {
684 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
685 if (!Pg)
686 return std::nullopt;
687
688 if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
689 return std::nullopt;
690
691 const auto PTruePattern =
692 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
693 if (PTruePattern != AArch64SVEPredPattern::vl1)
694 return std::nullopt;
695
696 // The intrinsic is inserting into lane zero so use an insert instead.
697 auto *IdxTy = Type::getInt64Ty(II.getContext());
698 auto *Insert = InsertElementInst::Create(
699 II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
700 Insert->insertBefore(&II);
701 Insert->takeName(&II);
702
703 return IC.replaceInstUsesWith(II, Insert);
704}
705
706static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
707 IntrinsicInst &II) {
708 // Replace DupX with a regular IR splat.
710 Builder.SetInsertPoint(&II);
711 auto *RetTy = cast<ScalableVectorType>(II.getType());
712 Value *Splat =
713 Builder.CreateVectorSplat(RetTy->getElementCount(), II.getArgOperand(0));
714 Splat->takeName(&II);
715 return IC.replaceInstUsesWith(II, Splat);
716}
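// Illustrative example (assumed types, not from the original source):
//   %s = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.x.nxv4i32(i32 %x)
// is rewritten as the generic insertelement + shufflevector splat idiom,
// which later passes understand better than the target intrinsic.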
717
718static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
719 IntrinsicInst &II) {
720 LLVMContext &Ctx = II.getContext();
721 IRBuilder<> Builder(Ctx);
722 Builder.SetInsertPoint(&II);
723
724 // Check that the predicate is all active
725 auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
726 if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
727 return std::nullopt;
728
729 const auto PTruePattern =
730 cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
731 if (PTruePattern != AArch64SVEPredPattern::all)
732 return std::nullopt;
733
734 // Check that we have a compare of zero..
735 auto *SplatValue =
736 dyn_cast_or_null<ConstantInt>(getSplatValue(II.getArgOperand(2)));
737 if (!SplatValue || !SplatValue->isZero())
738 return std::nullopt;
739
740 // ..against a dupq
741 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
742 if (!DupQLane ||
743 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
744 return std::nullopt;
745
746 // Where the dupq is a lane 0 replicate of a vector insert
747 if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
748 return std::nullopt;
749
750 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
751 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
752 return std::nullopt;
753
754 // Where the vector insert is a fixed constant vector insert into undef at
755 // index zero
756 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
757 return std::nullopt;
758
759 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
760 return std::nullopt;
761
762 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
763 if (!ConstVec)
764 return std::nullopt;
765
766 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
767 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
768 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
769 return std::nullopt;
770
771 unsigned NumElts = VecTy->getNumElements();
772 unsigned PredicateBits = 0;
773
774 // Expand intrinsic operands to a 16-bit byte level predicate
775 for (unsigned I = 0; I < NumElts; ++I) {
776 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
777 if (!Arg)
778 return std::nullopt;
779 if (!Arg->isZero())
780 PredicateBits |= 1 << (I * (16 / NumElts));
781 }
782
783 // If all bits are zero bail early with an empty predicate
784 if (PredicateBits == 0) {
785 auto *PFalse = Constant::getNullValue(II.getType());
786 PFalse->takeName(&II);
787 return IC.replaceInstUsesWith(II, PFalse);
788 }
789
790 // Calculate largest predicate type used (where byte predicate is largest)
791 unsigned Mask = 8;
792 for (unsigned I = 0; I < 16; ++I)
793 if ((PredicateBits & (1 << I)) != 0)
794 Mask |= (I % 8);
795
796 unsigned PredSize = Mask & -Mask;
797 auto *PredType = ScalableVectorType::get(
798 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
799
800 // Ensure all relevant bits are set
801 for (unsigned I = 0; I < 16; I += PredSize)
802 if ((PredicateBits & (1 << I)) == 0)
803 return std::nullopt;
804
805 auto *PTruePat =
806 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
807 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
808 {PredType}, {PTruePat});
809 auto *ConvertToSVBool = Builder.CreateIntrinsic(
810 Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
811 auto *ConvertFromSVBool =
812 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
813 {II.getType()}, {ConvertToSVBool});
814
815 ConvertFromSVBool->takeName(&II);
816 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
817}
818
819static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
820 IntrinsicInst &II) {
822 Builder.SetInsertPoint(&II);
823 Value *Pg = II.getArgOperand(0);
824 Value *Vec = II.getArgOperand(1);
825 auto IntrinsicID = II.getIntrinsicID();
826 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
827
828 // lastX(splat(X)) --> X
829 if (auto *SplatVal = getSplatValue(Vec))
830 return IC.replaceInstUsesWith(II, SplatVal);
831
832 // If x and/or y is a splat value then:
833 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
834 Value *LHS, *RHS;
835 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
836 if (isSplatValue(LHS) || isSplatValue(RHS)) {
837 auto *OldBinOp = cast<BinaryOperator>(Vec);
838 auto OpC = OldBinOp->getOpcode();
839 auto *NewLHS =
840 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
841 auto *NewRHS =
842 Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
843 auto *NewBinOp = BinaryOperator::CreateWithCopiedFlags(
844 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), &II);
845 return IC.replaceInstUsesWith(II, NewBinOp);
846 }
847 }
848
849 auto *C = dyn_cast<Constant>(Pg);
850 if (IsAfter && C && C->isNullValue()) {
851 // The intrinsic is extracting lane 0 so use an extract instead.
852 auto *IdxTy = Type::getInt64Ty(II.getContext());
853 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
854 Extract->insertBefore(&II);
855 Extract->takeName(&II);
856 return IC.replaceInstUsesWith(II, Extract);
857 }
858
859 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
860 if (!IntrPG)
861 return std::nullopt;
862
863 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
864 return std::nullopt;
865
866 const auto PTruePattern =
867 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
868
869 // Can the intrinsic's predicate be converted to a known constant index?
870 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
871 if (!MinNumElts)
872 return std::nullopt;
873
874 unsigned Idx = MinNumElts - 1;
875 // Increment the index if extracting the element after the last active
876 // predicate element.
877 if (IsAfter)
878 ++Idx;
879
880 // Ignore extracts whose index is larger than the known minimum vector
881 // length. NOTE: This is an artificial constraint where we prefer to
882 // maintain what the user asked for until an alternative is proven faster.
883 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
884 if (Idx >= PgVTy->getMinNumElements())
885 return std::nullopt;
886
887 // The intrinsic is extracting a fixed lane so use an extract instead.
888 auto *IdxTy = Type::getInt64Ty(II.getContext());
889 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
890 Extract->insertBefore(&II);
891 Extract->takeName(&II);
892 return IC.replaceInstUsesWith(II, Extract);
893}
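// Illustrative folds (assumed, not from the original source), with
// %v : <vscale x 4 x i32>:
//   lastb(ptrue vl4, %v) -> extractelement %v, i64 3
//   lasta(ptrue vl1, %v) -> extractelement %v, i64 1
//   lasta(ptrue vl4, %v) -> unchanged (lane 4 is not known to be within the
//                           minimum vector length)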
894
895static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
896 IntrinsicInst &II) {
897 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
898 // integer variant across a variety of micro-architectures. Replace scalar
899 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
900 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
901 // depending on the micro-architecture, but has been observed as generally
902 // being faster, particularly when the CLAST[AB] op is a loop-carried
903 // dependency.
905 Builder.SetInsertPoint(&II);
906 Value *Pg = II.getArgOperand(0);
907 Value *Fallback = II.getArgOperand(1);
908 Value *Vec = II.getArgOperand(2);
909 Type *Ty = II.getType();
910
911 if (!Ty->isIntegerTy())
912 return std::nullopt;
913
914 Type *FPTy;
915 switch (cast<IntegerType>(Ty)->getBitWidth()) {
916 default:
917 return std::nullopt;
918 case 16:
919 FPTy = Builder.getHalfTy();
920 break;
921 case 32:
922 FPTy = Builder.getFloatTy();
923 break;
924 case 64:
925 FPTy = Builder.getDoubleTy();
926 break;
927 }
928
929 Value *FPFallBack = Builder.CreateBitCast(Fallback, FPTy);
930 auto *FPVTy = VectorType::get(
931 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
932 Value *FPVec = Builder.CreateBitCast(Vec, FPVTy);
933 auto *FPII = Builder.CreateIntrinsic(II.getIntrinsicID(), {FPVec->getType()},
934 {Pg, FPFallBack, FPVec});
935 Value *FPIItoInt = Builder.CreateBitCast(FPII, II.getType());
936 return IC.replaceInstUsesWith(II, FPIItoInt);
937}
938
939static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
940 IntrinsicInst &II) {
941 LLVMContext &Ctx = II.getContext();
942 IRBuilder<> Builder(Ctx);
943 Builder.SetInsertPoint(&II);
944 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
945 // can work with RDFFR_PP for ptest elimination.
946 auto *AllPat =
947 ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
948 auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
949 {II.getType()}, {AllPat});
950 auto *RDFFR =
951 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z, {}, {PTrue});
952 RDFFR->takeName(&II);
953 return IC.replaceInstUsesWith(II, RDFFR);
954}
955
956static std::optional<Instruction *>
958 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
959
960 if (Pattern == AArch64SVEPredPattern::all) {
961 LLVMContext &Ctx = II.getContext();
962 IRBuilder<> Builder(Ctx);
963 Builder.SetInsertPoint(&II);
964
965 Constant *StepVal = ConstantInt::get(II.getType(), NumElts);
966 auto *VScale = Builder.CreateVScale(StepVal);
967 VScale->takeName(&II);
968 return IC.replaceInstUsesWith(II, VScale);
969 }
970
971 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
972
973 return MinNumElts && NumElts >= MinNumElts
974 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
975 II, ConstantInt::get(II.getType(), MinNumElts)))
976 : std::nullopt;
977}
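// Illustrative folds (assumed, not from the original source):
//   cntw(SV_ALL)  -> vscale * 4
//   cntb(SV_VL16) -> 16
//   cntd(SV_VL16) -> unchanged (a 128-bit vector cannot hold 16 doublewords)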
978
979static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
980 IntrinsicInst &II) {
981 Value *PgVal = II.getArgOperand(0);
982 Value *OpVal = II.getArgOperand(1);
983
985 Builder.SetInsertPoint(&II);
986
987 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
988 // Later optimizations prefer this form.
989 if (PgVal == OpVal &&
990 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
991 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
992 Value *Ops[] = {PgVal, OpVal};
993 Type *Tys[] = {PgVal->getType()};
994
995 auto *PTest =
996 Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
997 PTest->takeName(&II);
998
999 return IC.replaceInstUsesWith(II, PTest);
1000 }
1001
1002 IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(PgVal);
1003 IntrinsicInst *Op = dyn_cast<IntrinsicInst>(OpVal);
1004
1005 if (!Pg || !Op)
1006 return std::nullopt;
1007
1008 Intrinsic::ID OpIID = Op->getIntrinsicID();
1009
1010 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
1011 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
1012 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
1013 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
1014 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
1015
1016 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1017
1018 PTest->takeName(&II);
1019 return IC.replaceInstUsesWith(II, PTest);
1020 }
1021
1022 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
1023 // Later optimizations may rewrite sequence to use the flag-setting variant
1024 // of instruction X to remove PTEST.
1025 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
1026 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
1027 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
1028 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
1029 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
1030 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
1031 (OpIID == Intrinsic::aarch64_sve_and_z) ||
1032 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
1033 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
1034 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
1035 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
1036 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
1037 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
1038 Value *Ops[] = {Pg->getArgOperand(0), Pg};
1039 Type *Tys[] = {Pg->getType()};
1040
1041 auto *PTest = Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
1042 PTest->takeName(&II);
1043
1044 return IC.replaceInstUsesWith(II, PTest);
1045 }
1046
1047 return std::nullopt;
1048}
1049
1050template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
1051static std::optional<Instruction *>
1053 bool MergeIntoAddendOp) {
1054 Value *P = II.getOperand(0);
1055 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
1056 if (MergeIntoAddendOp) {
1057 AddendOp = II.getOperand(1);
1058 Mul = II.getOperand(2);
1059 } else {
1060 AddendOp = II.getOperand(2);
1061 Mul = II.getOperand(1);
1062 }
1063
1064 if (!match(Mul, m_Intrinsic<MulOpc>(m_Specific(P), m_Value(MulOp0),
1065 m_Value(MulOp1))))
1066 return std::nullopt;
1067
1068 if (!Mul->hasOneUse())
1069 return std::nullopt;
1070
1071 Instruction *FMFSource = nullptr;
1072 if (II.getType()->isFPOrFPVectorTy()) {
1073 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
1074 // Stop the combine when the flags on the inputs differ in case dropping
1075 // flags would lead to us missing out on more beneficial optimizations.
1076 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
1077 return std::nullopt;
1078 if (!FAddFlags.allowContract())
1079 return std::nullopt;
1080 FMFSource = &II;
1081 }
1082
1084 Builder.SetInsertPoint(&II);
1085
1086 CallInst *Res;
1087 if (MergeIntoAddendOp)
1088 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1089 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
1090 else
1091 Res = Builder.CreateIntrinsic(FuseOpc, {II.getType()},
1092 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
1093
1094 return IC.replaceInstUsesWith(II, Res);
1095}
1096
1097static bool isAllActivePredicate(Value *Pred) {
1098 // Look through a convert.from.svbool(convert.to.svbool(...)) chain.
1099 Value *UncastedPred;
1100 if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
1101 m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(
1102 m_Value(UncastedPred)))))
1103 // If the predicate has the same or fewer lanes than the uncasted
1104 // predicate, then we know the casting has no effect.
1105 if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
1106 cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())
1107 Pred = UncastedPred;
1108
1109 return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1110 m_ConstantInt<AArch64SVEPredPattern::all>()));
1111}
1112
1113static std::optional<Instruction *>
1116 Builder.SetInsertPoint(&II);
1117
1118 Value *Pred = II.getOperand(0);
1119 Value *PtrOp = II.getOperand(1);
1120 Type *VecTy = II.getType();
1121 Value *VecPtr = Builder.CreateBitCast(PtrOp, VecTy->getPointerTo());
1122
1123 if (isAllActivePredicate(Pred)) {
1124 LoadInst *Load = Builder.CreateLoad(VecTy, VecPtr);
1125 Load->copyMetadata(II);
1126 return IC.replaceInstUsesWith(II, Load);
1127 }
1128
1129 CallInst *MaskedLoad =
1130 Builder.CreateMaskedLoad(VecTy, VecPtr, PtrOp->getPointerAlignment(DL),
1131 Pred, ConstantAggregateZero::get(VecTy));
1132 MaskedLoad->copyMetadata(II);
1133 return IC.replaceInstUsesWith(II, MaskedLoad);
1134}
1135
1136static std::optional<Instruction *>
1139 Builder.SetInsertPoint(&II);
1140
1141 Value *VecOp = II.getOperand(0);
1142 Value *Pred = II.getOperand(1);
1143 Value *PtrOp = II.getOperand(2);
1144 Value *VecPtr =
1145 Builder.CreateBitCast(PtrOp, VecOp->getType()->getPointerTo());
1146
1147 if (isAllActivePredicate(Pred)) {
1148 StoreInst *Store = Builder.CreateStore(VecOp, VecPtr);
1149 Store->copyMetadata(II);
1150 return IC.eraseInstFromFunction(II);
1151 }
1152
1153 CallInst *MaskedStore = Builder.CreateMaskedStore(
1154 VecOp, VecPtr, PtrOp->getPointerAlignment(DL), Pred);
1155 MaskedStore->copyMetadata(II);
1156 return IC.eraseInstFromFunction(II);
1157}
1158
1160 switch (Intrinsic) {
1161 case Intrinsic::aarch64_sve_fmul:
1162 return Instruction::BinaryOps::FMul;
1163 case Intrinsic::aarch64_sve_fadd:
1164 return Instruction::BinaryOps::FAdd;
1165 case Intrinsic::aarch64_sve_fsub:
1166 return Instruction::BinaryOps::FSub;
1167 default:
1168 return Instruction::BinaryOpsEnd;
1169 }
1170}
1171
1172static std::optional<Instruction *>
1174 auto *OpPredicate = II.getOperand(0);
1175 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
1176 if (BinOpCode == Instruction::BinaryOpsEnd ||
1177 !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
1178 m_ConstantInt<AArch64SVEPredPattern::all>())))
1179 return std::nullopt;
1181 Builder.SetInsertPoint(&II);
1182 Builder.setFastMathFlags(II.getFastMathFlags());
1183 auto BinOp =
1184 Builder.CreateBinOp(BinOpCode, II.getOperand(1), II.getOperand(2));
1185 return IC.replaceInstUsesWith(II, BinOp);
1186}
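// Illustrative example (assumed types, not from the original source): with a
// ptrue-all governing predicate,
//   call <vscale x 2 x double> @llvm.aarch64.sve.fadd.nxv2f64(<vscale x 2 x i1> %all, <vscale x 2 x double> %a, <vscale x 2 x double> %b)
// simplifies to a plain "fadd %a, %b", carrying over the call's fast-math
// flags.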
1187
1188static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
1189 IntrinsicInst &II) {
1190 if (auto FMLA =
1191 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1192 Intrinsic::aarch64_sve_fmla>(IC, II,
1193 true))
1194 return FMLA;
1195 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1196 Intrinsic::aarch64_sve_mla>(
1197 IC, II, true))
1198 return MLA;
1199 if (auto FMAD =
1200 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1201 Intrinsic::aarch64_sve_fmad>(IC, II,
1202 false))
1203 return FMAD;
1204 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1205 Intrinsic::aarch64_sve_mad>(
1206 IC, II, false))
1207 return MAD;
1208 return instCombineSVEVectorBinOp(IC, II);
1209}
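// Illustrative fusions (assumed operands, not from the original source):
//   add(pg, %acc, mul(pg, %a, %b))   -> mla(pg, %acc, %a, %b)
//   fadd(pg, %acc, fmul(pg, %a, %b)) -> fmla(pg, %acc, %a, %b)
// provided the multiply has a single use, both calls share the same
// predicate, and (for the FP case) both carry matching fast-math flags that
// allow contraction.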
1210
1211static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
1212 IntrinsicInst &II) {
1213 if (auto FMLS =
1214 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1215 Intrinsic::aarch64_sve_fmls>(IC, II,
1216 true))
1217 return FMLS;
1218 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
1219 Intrinsic::aarch64_sve_mls>(
1220 IC, II, true))
1221 return MLS;
1222 if (auto FMSB =
1223 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
1224 Intrinsic::aarch64_sve_fnmsb>(
1225 IC, II, false))
1226 return FMSB;
1227 return instCombineSVEVectorBinOp(IC, II);
1228}
1229
1230static std::optional<Instruction *> instCombineSVEVectorMul(InstCombiner &IC,
1231 IntrinsicInst &II) {
1232 auto *OpPredicate = II.getOperand(0);
1233 auto *OpMultiplicand = II.getOperand(1);
1234 auto *OpMultiplier = II.getOperand(2);
1235
1237 Builder.SetInsertPoint(&II);
1238
1239 // Return true if a given instruction is a unit splat value, false otherwise.
1240 auto IsUnitSplat = [](auto *I) {
1241 auto *SplatValue = getSplatValue(I);
1242 if (!SplatValue)
1243 return false;
1244 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1245 };
1246
1247 // Return true if a given instruction is an aarch64_sve_dup intrinsic call
1248 // with a unit splat value, false otherwise.
1249 auto IsUnitDup = [](auto *I) {
1250 auto *IntrI = dyn_cast<IntrinsicInst>(I);
1251 if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
1252 return false;
1253
1254 auto *SplatValue = IntrI->getOperand(2);
1255 return match(SplatValue, m_FPOne()) || match(SplatValue, m_One());
1256 };
1257
1258 if (IsUnitSplat(OpMultiplier)) {
1259 // [f]mul pg %n, (dupx 1) => %n
1260 OpMultiplicand->takeName(&II);
1261 return IC.replaceInstUsesWith(II, OpMultiplicand);
1262 } else if (IsUnitDup(OpMultiplier)) {
1263 // [f]mul pg %n, (dup pg 1) => %n
1264 auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
1265 auto *DupPg = DupInst->getOperand(1);
1266 // TODO: this is naive. The optimization is still valid if DupPg
1267 // 'encompasses' OpPredicate, not only if they're the same predicate.
1268 if (OpPredicate == DupPg) {
1269 OpMultiplicand->takeName(&II);
1270 return IC.replaceInstUsesWith(II, OpMultiplicand);
1271 }
1272 }
1273
1274 return instCombineSVEVectorBinOp(IC, II);
1275}
1276
1277static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
1278 IntrinsicInst &II) {
1280 Builder.SetInsertPoint(&II);
1281 Value *UnpackArg = II.getArgOperand(0);
1282 auto *RetTy = cast<ScalableVectorType>(II.getType());
1283 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
1284 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
1285
1286 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
1287 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
1288 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
1289 ScalarArg =
1290 Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
1291 Value *NewVal =
1292 Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
1293 NewVal->takeName(&II);
1294 return IC.replaceInstUsesWith(II, NewVal);
1295 }
1296
1297 return std::nullopt;
1298}
1299static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
1300 IntrinsicInst &II) {
1301 auto *OpVal = II.getOperand(0);
1302 auto *OpIndices = II.getOperand(1);
1303 VectorType *VTy = cast<VectorType>(II.getType());
1304
1305 // Check whether OpIndices is a constant splat value < minimal element count
1306 // of result.
1307 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
1308 if (!SplatValue ||
1309 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
1310 return std::nullopt;
1311
1312 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
1313 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
1315 Builder.SetInsertPoint(&II);
1316 auto *Extract = Builder.CreateExtractElement(OpVal, SplatValue);
1317 auto *VectorSplat =
1318 Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
1319
1320 VectorSplat->takeName(&II);
1321 return IC.replaceInstUsesWith(II, VectorSplat);
1322}
1323
1324static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
1325 IntrinsicInst &II) {
1326 // zip1(uzp1(A, B), uzp2(A, B)) --> A
1327 // zip2(uzp1(A, B), uzp2(A, B)) --> B
1328 Value *A, *B;
1329 if (match(II.getArgOperand(0),
1330 m_Intrinsic<Intrinsic::aarch64_sve_uzp1>(m_Value(A), m_Value(B))) &&
1331 match(II.getArgOperand(1), m_Intrinsic<Intrinsic::aarch64_sve_uzp2>(
1332 m_Specific(A), m_Specific(B))))
1333 return IC.replaceInstUsesWith(
1334 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
1335
1336 return std::nullopt;
1337}
1338
1339static std::optional<Instruction *>
1341 Value *Mask = II.getOperand(0);
1342 Value *BasePtr = II.getOperand(1);
1343 Value *Index = II.getOperand(2);
1344 Type *Ty = II.getType();
1345 Value *PassThru = ConstantAggregateZero::get(Ty);
1346
1347 // Contiguous gather => masked load.
1348 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
1349 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
1350 Value *IndexBase;
1351 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1352 m_Value(IndexBase), m_SpecificInt(1)))) {
1354 Builder.SetInsertPoint(&II);
1355
1356 Align Alignment =
1357 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1358
1359 Type *VecPtrTy = PointerType::getUnqual(Ty);
1360 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1361 BasePtr, IndexBase);
1362 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1363 CallInst *MaskedLoad =
1364 Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
1365 MaskedLoad->takeName(&II);
1366 return IC.replaceInstUsesWith(II, MaskedLoad);
1367 }
1368
1369 return std::nullopt;
1370}
1371
1372static std::optional<Instruction *>
1374 Value *Val = II.getOperand(0);
1375 Value *Mask = II.getOperand(1);
1376 Value *BasePtr = II.getOperand(2);
1377 Value *Index = II.getOperand(3);
1378 Type *Ty = Val->getType();
1379
1380 // Contiguous scatter => masked store.
1381 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
1382 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
1383 Value *IndexBase;
1384 if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(
1385 m_Value(IndexBase), m_SpecificInt(1)))) {
1387 Builder.SetInsertPoint(&II);
1388
1389 Align Alignment =
1390 BasePtr->getPointerAlignment(II.getModule()->getDataLayout());
1391
1392 Value *Ptr = Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
1393 BasePtr, IndexBase);
1394 Type *VecPtrTy = PointerType::getUnqual(Ty);
1395 Ptr = Builder.CreateBitCast(Ptr, VecPtrTy);
1396
1397 (void)Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
1398
1399 return IC.eraseInstFromFunction(II);
1400 }
1401
1402 return std::nullopt;
1403}
1404
1405static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
1406 IntrinsicInst &II) {
1408 Builder.SetInsertPoint(&II);
1409 Type *Int32Ty = Builder.getInt32Ty();
1410 Value *Pred = II.getOperand(0);
1411 Value *Vec = II.getOperand(1);
1412 Value *DivVec = II.getOperand(2);
1413
1414 Value *SplatValue = getSplatValue(DivVec);
1415 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
1416 if (!SplatConstantInt)
1417 return std::nullopt;
1418 APInt Divisor = SplatConstantInt->getValue();
1419
1420 if (Divisor.isPowerOf2()) {
1421 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1422 auto ASRD = Builder.CreateIntrinsic(
1423 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1424 return IC.replaceInstUsesWith(II, ASRD);
1425 }
1426 if (Divisor.isNegatedPowerOf2()) {
1427 Divisor.negate();
1428 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
1429 auto ASRD = Builder.CreateIntrinsic(
1430 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
1431 auto NEG = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_neg,
1432 {ASRD->getType()}, {ASRD, Pred, ASRD});
1433 return IC.replaceInstUsesWith(II, NEG);
1434 }
1435
1436 return std::nullopt;
1437}
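// Illustrative folds (assumed, not from the original source):
//   sdiv(pg, %x, splat(8))  -> asrd(pg, %x, 3)
//   sdiv(pg, %x, splat(-8)) -> neg of asrd(pg, %x, 3)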
1438
1439bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
1440 size_t VecSize = Vec.size();
1441 if (VecSize == 1)
1442 return true;
1443 if (!isPowerOf2_64(VecSize))
1444 return false;
1445 size_t HalfVecSize = VecSize / 2;
1446
1447 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
1448 RHS != Vec.end(); LHS++, RHS++) {
1449 if (*LHS != nullptr && *RHS != nullptr) {
1450 if (*LHS == *RHS)
1451 continue;
1452 else
1453 return false;
1454 }
1455 if (!AllowPoison)
1456 return false;
1457 if (*LHS == nullptr && *RHS != nullptr)
1458 *LHS = *RHS;
1459 }
1460
1461 Vec.resize(HalfVecSize);
1462 SimplifyValuePattern(Vec, AllowPoison);
1463 return true;
1464}
1465
1466// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
1467// to dupqlane(f64(C)) where C is A concatenated with B
1468static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
1469 IntrinsicInst &II) {
1470 Value *CurrentInsertElt = nullptr, *Default = nullptr;
1471 if (!match(II.getOperand(0),
1472 m_Intrinsic<Intrinsic::vector_insert>(
1473 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
1474 !isa<FixedVectorType>(CurrentInsertElt->getType()))
1475 return std::nullopt;
1476 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
1477
1478 // Insert the scalars into a container ordered by InsertElement index
1479 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
1480 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
1481 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
1482 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
1483 CurrentInsertElt = InsertElt->getOperand(0);
1484 }
1485
1486 bool AllowPoison =
1487 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
1488 if (!SimplifyValuePattern(Elts, AllowPoison))
1489 return std::nullopt;
1490
1491 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
1493 Builder.SetInsertPoint(&II);
1494 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
1495 for (size_t I = 0; I < Elts.size(); I++) {
1496 if (Elts[I] == nullptr)
1497 continue;
1498 InsertEltChain = Builder.CreateInsertElement(InsertEltChain, Elts[I],
1499 Builder.getInt64(I));
1500 }
1501 if (InsertEltChain == nullptr)
1502 return std::nullopt;
1503
1504 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
1505 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
1506 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
1507 // be narrowed back to the original type.
1508 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
1509 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
1510 IIScalableTy->getMinNumElements() /
1511 PatternWidth;
1512
1513 IntegerType *WideTy = Builder.getIntNTy(PatternWidth);
1514 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
1515 auto *WideShuffleMaskTy =
1516 ScalableVectorType::get(Builder.getInt32Ty(), PatternElementCount);
1517
1518 auto ZeroIdx = ConstantInt::get(Builder.getInt64Ty(), APInt(64, 0));
1519 auto InsertSubvector = Builder.CreateInsertVector(
1520 II.getType(), PoisonValue::get(II.getType()), InsertEltChain, ZeroIdx);
1521 auto WideBitcast =
1522 Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
1523 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
1524 auto WideShuffle = Builder.CreateShuffleVector(
1525 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
1526 auto NarrowBitcast =
1527 Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
1528
1529 return IC.replaceInstUsesWith(II, NarrowBitcast);
1530}
1531
1532static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
1533 IntrinsicInst &II) {
1534 Value *A = II.getArgOperand(0);
1535 Value *B = II.getArgOperand(1);
1536 if (A == B)
1537 return IC.replaceInstUsesWith(II, A);
1538
1539 return std::nullopt;
1540}
1541
1542static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
1543 IntrinsicInst &II) {
1544 IRBuilder<> Builder(&II);
1545 Value *Pred = II.getOperand(0);
1546 Value *Vec = II.getOperand(1);
1547 Value *Shift = II.getOperand(2);
1548
1549 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
1550 Value *AbsPred, *MergedValue;
1551 if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
1552 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
1553 !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
1554 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
1555
1556 return std::nullopt;
1557
1558 // Transform is valid if any of the following are true:
1559 // * The ABS merge value is an undef or non-negative
1560 // * The ABS predicate is all active
1561 // * The ABS predicate and the SRSHL predicates are the same
1562 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
1563 AbsPred != Pred && !isAllActivePredicate(AbsPred))
1564 return std::nullopt;
1565
1566 // Only valid when the shift amount is non-negative, otherwise the rounding
1567 // behaviour of SRSHL cannot be ignored.
1568 if (!match(Shift, m_NonNegative()))
1569 return std::nullopt;
1570
1571 auto LSL = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl, {II.getType()},
1572 {Pred, Vec, Shift});
1573
1574 return IC.replaceInstUsesWith(II, LSL);
1575}
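// Illustrative fold (assumed, not from the original source):
//   srshl(pg, sve.abs(undef, pg, %x), splat(2)) -> lsl(pg, <abs result>, splat(2))
// The absolute value is known non-negative and the shift amount is a
// non-negative splat, so the rounding behaviour of SRSHL is irrelevant.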
1576
1577std::optional<Instruction *>
1578AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
1579 IntrinsicInst &II) const {
1580 Intrinsic::ID IID = II.getIntrinsicID();
1581 switch (IID) {
1582 default:
1583 break;
1584 case Intrinsic::aarch64_neon_fmaxnm:
1585 case Intrinsic::aarch64_neon_fminnm:
1586 return instCombineMaxMinNM(IC, II);
1587 case Intrinsic::aarch64_sve_convert_from_svbool:
1588 return instCombineConvertFromSVBool(IC, II);
1589 case Intrinsic::aarch64_sve_dup:
1590 return instCombineSVEDup(IC, II);
1591 case Intrinsic::aarch64_sve_dup_x:
1592 return instCombineSVEDupX(IC, II);
1593 case Intrinsic::aarch64_sve_cmpne:
1594 case Intrinsic::aarch64_sve_cmpne_wide:
1595 return instCombineSVECmpNE(IC, II);
1596 case Intrinsic::aarch64_sve_rdffr:
1597 return instCombineRDFFR(IC, II);
1598 case Intrinsic::aarch64_sve_lasta:
1599 case Intrinsic::aarch64_sve_lastb:
1600 return instCombineSVELast(IC, II);
1601 case Intrinsic::aarch64_sve_clasta_n:
1602 case Intrinsic::aarch64_sve_clastb_n:
1603 return instCombineSVECondLast(IC, II);
1604 case Intrinsic::aarch64_sve_cntd:
1605 return instCombineSVECntElts(IC, II, 2);
1606 case Intrinsic::aarch64_sve_cntw:
1607 return instCombineSVECntElts(IC, II, 4);
1608 case Intrinsic::aarch64_sve_cnth:
1609 return instCombineSVECntElts(IC, II, 8);
1610 case Intrinsic::aarch64_sve_cntb:
1611 return instCombineSVECntElts(IC, II, 16);
1612 case Intrinsic::aarch64_sve_ptest_any:
1613 case Intrinsic::aarch64_sve_ptest_first:
1614 case Intrinsic::aarch64_sve_ptest_last:
1615 return instCombineSVEPTest(IC, II);
1616 case Intrinsic::aarch64_sve_mul:
1617 case Intrinsic::aarch64_sve_fmul:
1618 return instCombineSVEVectorMul(IC, II);
1619 case Intrinsic::aarch64_sve_fadd:
1620 case Intrinsic::aarch64_sve_add:
1621 return instCombineSVEVectorAdd(IC, II);
1622 case Intrinsic::aarch64_sve_fsub:
1623 case Intrinsic::aarch64_sve_sub:
1624 return instCombineSVEVectorSub(IC, II);
1625 case Intrinsic::aarch64_sve_tbl:
1626 return instCombineSVETBL(IC, II);
1627 case Intrinsic::aarch64_sve_uunpkhi:
1628 case Intrinsic::aarch64_sve_uunpklo:
1629 case Intrinsic::aarch64_sve_sunpkhi:
1630 case Intrinsic::aarch64_sve_sunpklo:
1631 return instCombineSVEUnpack(IC, II);
1632 case Intrinsic::aarch64_sve_zip1:
1633 case Intrinsic::aarch64_sve_zip2:
1634 return instCombineSVEZip(IC, II);
1635 case Intrinsic::aarch64_sve_ld1_gather_index:
1636 return instCombineLD1GatherIndex(IC, II);
1637 case Intrinsic::aarch64_sve_st1_scatter_index:
1638 return instCombineST1ScatterIndex(IC, II);
1639 case Intrinsic::aarch64_sve_ld1:
1640 return instCombineSVELD1(IC, II, DL);
1641 case Intrinsic::aarch64_sve_st1:
1642 return instCombineSVEST1(IC, II, DL);
1643 case Intrinsic::aarch64_sve_sdiv:
1644 return instCombineSVESDIV(IC, II);
1645 case Intrinsic::aarch64_sve_sel:
1646 return instCombineSVESel(IC, II);
1647 case Intrinsic::aarch64_sve_srshl:
1648 return instCombineSVESrshl(IC, II);
1649 case Intrinsic::aarch64_sve_dupq_lane:
1650 return instCombineSVEDupqLane(IC, II);
1651 }
1652
1653 return std::nullopt;
1654}
1655
1657 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
1658 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
1659 std::function<void(Instruction *, unsigned, APInt, APInt &)>
1660 SimplifyAndSetOp) const {
1661 switch (II.getIntrinsicID()) {
1662 default:
1663 break;
1664 case Intrinsic::aarch64_neon_fcvtxn:
1665 case Intrinsic::aarch64_neon_rshrn:
1666 case Intrinsic::aarch64_neon_sqrshrn:
1667 case Intrinsic::aarch64_neon_sqrshrun:
1668 case Intrinsic::aarch64_neon_sqshrn:
1669 case Intrinsic::aarch64_neon_sqshrun:
1670 case Intrinsic::aarch64_neon_sqxtn:
1671 case Intrinsic::aarch64_neon_sqxtun:
1672 case Intrinsic::aarch64_neon_uqrshrn:
1673 case Intrinsic::aarch64_neon_uqshrn:
1674 case Intrinsic::aarch64_neon_uqxtn:
1675 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
1676 break;
1677 }
1678
1679 return std::nullopt;
1680}
1681
1684 switch (K) {
1686 return TypeSize::getFixed(64);
1688 if (!ST->isStreamingSVEModeDisabled() &&
1690 return TypeSize::getFixed(0);
1691
1692 if (ST->hasSVE())
1693 return TypeSize::getFixed(
1694 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
1695
1696 return TypeSize::getFixed(ST->hasNEON() ? 128 : 0);
1699 return TypeSize::getScalable(0);
1700
1701 return TypeSize::getScalable(ST->hasSVE() ? 128 : 0);
1702 }
1703 llvm_unreachable("Unsupported register kind");
1704}
1705
1706bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
1707                                           ArrayRef<const Value *> Args) {
1708
1709 // A helper that returns a vector type from the given type. The number of
1710 // elements in type Ty determines the vector width.
1711 auto toVectorTy = [&](Type *ArgTy) {
1712 return VectorType::get(ArgTy->getScalarType(),
1713 cast<VectorType>(DstTy)->getElementCount());
1714 };
1715
1716 // Exit early if DstTy is not a vector type whose elements are at least
1717 // 16-bits wide. SVE doesn't generally have the same set of instructions to
1718 // perform an extend with the add/sub/mul. There are SMULLB style
1719 // instructions, but they operate on top/bottom, requiring some sort of lane
1720 // interleaving to be used with zext/sext.
1721 if (!useNeonVector(DstTy) || DstTy->getScalarSizeInBits() < 16)
1722 return false;
1723
1724 // Determine if the operation has a widening variant. We consider both the
1725 // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the
1726 // instructions.
1727 //
1728 // TODO: Add additional widening operations (e.g., shl, etc.) once we
1729 // verify that their extending operands are eliminated during code
1730 // generation.
1731 switch (Opcode) {
1732 case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2).
1733 case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2).
1734 case Instruction::Mul: // SMULL(2), UMULL(2)
1735 break;
1736 default:
1737 return false;
1738 }
1739
1740 // To be a widening instruction (either the "wide" or "long" versions), the
1741 // second operand must be a sign- or zero extend.
1742 if (Args.size() != 2 ||
1743 (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])))
1744 return false;
1745 auto *Extend = cast<CastInst>(Args[1]);
1746 auto *Arg0 = dyn_cast<CastInst>(Args[0]);
1747
1748  // A mul only has a mull version (not like addw). Both operands need to be
1749  // extends of the same kind with the same source type.
1750 if (Opcode == Instruction::Mul &&
1751 (!Arg0 || Arg0->getOpcode() != Extend->getOpcode() ||
1752 Arg0->getOperand(0)->getType() != Extend->getOperand(0)->getType()))
1753 return false;
1754
1755 // Legalize the destination type and ensure it can be used in a widening
1756 // operation.
1757 auto DstTyL = getTypeLegalizationCost(DstTy);
1758 unsigned DstElTySize = DstTyL.second.getScalarSizeInBits();
1759 if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits())
1760 return false;
1761
1762 // Legalize the source type and ensure it can be used in a widening
1763 // operation.
1764 auto *SrcTy = toVectorTy(Extend->getSrcTy());
1765 auto SrcTyL = getTypeLegalizationCost(SrcTy);
1766 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
1767 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
1768 return false;
1769
1770 // Get the total number of vector elements in the legalized types.
1771 InstructionCost NumDstEls =
1772 DstTyL.first * DstTyL.second.getVectorMinNumElements();
1773 InstructionCost NumSrcEls =
1774 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
1775
1776 // Return true if the legalized types have the same number of vector elements
1777 // and the destination element type size is twice that of the source type.
1778 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize;
1779}
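// Worked example (editorial addition, not from the LLVM source): the helper
// above accepts patterns such as
//   %ext = zext <8 x i8> %b to <8 x i16>
//   %add = add <8 x i16> %a, %ext
// because the extend is operand 1, the legalized source (v8i8) and destination
// (v8i16) have the same element count, and the destination element (16 bits)
// is exactly twice the source element (8 bits), so the add can be selected as
// a single uaddw and the zext becomes free.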
1780
1782 Type *Src,
1785 const Instruction *I) {
1786 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1787 assert(ISD && "Invalid opcode");
1788
1789 // If the cast is observable, and it is used by a widening instruction (e.g.,
1790 // uaddl, saddw, etc.), it may be free.
1791 if (I && I->hasOneUser()) {
1792 auto *SingleUser = cast<Instruction>(*I->user_begin());
1793 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
1794 if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) {
1795 // If the cast is the second operand, it is free. We will generate either
1796 // a "wide" or "long" version of the widening instruction.
1797 if (I == SingleUser->getOperand(1))
1798 return 0;
1799 // If the cast is not the second operand, it will be free if it looks the
1800 // same as the second operand. In this case, we will generate a "long"
1801 // version of the widening instruction.
1802 if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1)))
1803 if (I->getOpcode() == unsigned(Cast->getOpcode()) &&
1804 cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy())
1805 return 0;
1806 }
1807 }
1808
1809 // TODO: Allow non-throughput costs that aren't binary.
1810 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
1812 return Cost == 0 ? 0 : 1;
1813 return Cost;
1814 };
1815
1816 EVT SrcTy = TLI->getValueType(DL, Src);
1817 EVT DstTy = TLI->getValueType(DL, Dst);
1818
1819 if (!SrcTy.isSimple() || !DstTy.isSimple())
1820 return AdjustCost(
1821 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
1822
1823 static const TypeConversionCostTblEntry
1824 ConversionTbl[] = {
1825 { ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
1826 { ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
1827 { ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
1828 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
1829 { ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
1830 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
1831 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
1832 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
1833 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
1834 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
1835 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
1836 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
1837 { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
1838 { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
1839 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
1840 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
1841 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
1842 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
1843 { ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
1844 { ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
1845
1846 // Truncations on nxvmiN
1863
1864 // The number of shll instructions for the extension.
1881
1882 // LowerVectorINT_TO_FP:
1889
1890 // Complex: to v2f32
1897
1898 // Complex: to v4f32
1903
1904 // Complex: to v8f32
1909
1910 // Complex: to v16f32
1913
1914 // Complex: to v2f64
1921
1922 // Complex: to v4f64
1925
1926 // LowerVectorFP_TO_INT
1933
1934 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
1941
1942 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
1947
1948 // Complex, from nxv2f32.
1957
1958 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
1965
1966 // Complex, from nxv2f64.
1975
1976 // Complex, from nxv4f32.
1985
1986 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
1991
1992 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
1999
2000 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
2005
2006 // Complex, from nxv8f16.
2015
2016 // Complex, from nxv4f16.
2025
2026 // Complex, from nxv2f16.
2035
2036 // Truncate from nxvmf32 to nxvmf16.
2040
2041 // Truncate from nxvmf64 to nxvmf16.
2045
2046 // Truncate from nxvmf64 to nxvmf32.
2050
2051 // Extend from nxvmf16 to nxvmf32.
2055
2056 // Extend from nxvmf16 to nxvmf64.
2060
2061 // Extend from nxvmf32 to nxvmf64.
2065
2066 // Bitcasts from float to integer
2070
2071 // Bitcasts from integer to float
2075 };
2076
2077 if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
2078 DstTy.getSimpleVT(),
2079 SrcTy.getSimpleVT()))
2080 return AdjustCost(Entry->Cost);
2081
2082 static const TypeConversionCostTblEntry FP16Tbl[] = {
2083 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
2085 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
2087 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
2089 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
2091 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
2093 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
2095 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
2097 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
2099 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
2101 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
2102 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
2103 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushl(2) + 2 * ucvtf
2104 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshl(2) + 2 * scvtf
2105 };
2106
2107 if (ST->hasFullFP16())
2108 if (const auto *Entry = ConvertCostTableLookup(
2109 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
2110 return AdjustCost(Entry->Cost);
2111
2112 return AdjustCost(
2113 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
2114}
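// Worked example (editorial addition, not from the LLVM source): a truncate
//   %t = trunc <4 x i64> %v to <4 x i16>
// matches the { ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 } entry above (one
// uzp1 plus one xtn), so it is costed at 2 for the throughput cost kind; for
// non-throughput cost kinds AdjustCost clamps any non-zero table cost to 1.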
2115
2117 Type *Dst,
2118 VectorType *VecTy,
2119 unsigned Index) {
2120
2121 // Make sure we were given a valid extend opcode.
2122 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
2123 "Invalid opcode");
2124
2125 // We are extending an element we extract from a vector, so the source type
2126 // of the extend is the element type of the vector.
2127 auto *Src = VecTy->getElementType();
2128
2129 // Sign- and zero-extends are for integer types only.
2130 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
2131
2132 // Get the cost for the extract. We compute the cost (if any) for the extend
2133 // below.
2135 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
2136 CostKind, Index, nullptr, nullptr);
2137
2138 // Legalize the types.
2139 auto VecLT = getTypeLegalizationCost(VecTy);
2140 auto DstVT = TLI->getValueType(DL, Dst);
2141 auto SrcVT = TLI->getValueType(DL, Src);
2142
2143 // If the resulting type is still a vector and the destination type is legal,
2144 // we may get the extension for free. If not, get the default cost for the
2145 // extend.
2146 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
2147 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2148 CostKind);
2149
2150 // The destination type should be larger than the element type. If not, get
2151 // the default cost for the extend.
2152 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
2153 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2154 CostKind);
2155
2156 switch (Opcode) {
2157 default:
2158 llvm_unreachable("Opcode should be either SExt or ZExt");
2159
2160 // For sign-extends, we only need a smov, which performs the extension
2161 // automatically.
2162 case Instruction::SExt:
2163 return Cost;
2164
2165 // For zero-extends, the extend is performed automatically by a umov unless
2166 // the destination type is i64 and the element type is i8 or i16.
2167 case Instruction::ZExt:
2168 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
2169 return Cost;
2170 }
2171
2172 // If we are unable to perform the extend for free, get the default cost.
2173 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
2174 CostKind);
2175}
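// Worked example (editorial addition, not from the LLVM source): sign-extending
// an extracted lane of <4 x i32> to i64 only needs an smov, so only the extract
// cost is returned; zero-extending an extracted i8 lane of <16 x i8> to i64
// cannot be done by a umov alone, so the cost of the zext is added on top.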
2176
2179 const Instruction *I) {
2181 return Opcode == Instruction::PHI ? 0 : 1;
2182 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
2183 // Branches are assumed to be predicted.
2184 return 0;
2185}
2186
2187InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(Type *Val,
2188 unsigned Index,
2189 bool HasRealUse) {
2190 assert(Val->isVectorTy() && "This must be a vector type");
2191
2192 if (Index != -1U) {
2193 // Legalize the type.
2194 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2195
2196 // This type is legalized to a scalar type.
2197 if (!LT.second.isVector())
2198 return 0;
2199
2200 // The type may be split. For fixed-width vectors we can normalize the
2201 // index to the new type.
2202 if (LT.second.isFixedLengthVector()) {
2203 unsigned Width = LT.second.getVectorNumElements();
2204 Index = Index % Width;
2205 }
2206
2207 // The element at index zero is already inside the vector.
2208 // - For a physical (HasRealUse==true) insert-element or extract-element
2209 // instruction that extracts integers, an explicit FPR -> GPR move is
2210 // needed. So it has non-zero cost.
2211 // - For the rest of cases (virtual instruction or element type is float),
2212 // consider the instruction free.
2213 //
2214 // FIXME:
2215 // If the extract-element and insert-element instructions could be
2216 // simplified away (e.g., could be combined into users by looking at use-def
2217 // context), they have no cost. This is not done in the first place for
2218 // compile-time considerations.
2219 if (Index == 0 && (!HasRealUse || !Val->getScalarType()->isIntegerTy()))
2220 return 0;
2221 }
2222
2223 // All other insert/extracts cost this much.
2224 return ST->getVectorInsertExtractBaseCost();
2225}
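// Illustrative example (editorial addition, not from the LLVM source): an
// extractelement of lane 0 from <2 x double> is modelled as free, since the
// scalar already lives in the low part of the FPR, whereas a real
// (HasRealUse) extract of lane 0 from <4 x i32> still pays
// getVectorInsertExtractBaseCost() because it needs an explicit FPR -> GPR
// move (fmov/umov).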
2226
2229 unsigned Index, Value *Op0,
2230 Value *Op1) {
2231 return getVectorInstrCostHelper(Val, Index, false /* HasRealUse */);
2232}
2233
2235 Type *Val,
2237 unsigned Index) {
2238 return getVectorInstrCostHelper(Val, Index, true /* HasRealUse */);
2239}
2240
2242 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2245 const Instruction *CxtI) {
2246
2247 // TODO: Handle more cost kinds.
2249 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2250 Op2Info, Args, CxtI);
2251
2252 // Legalize the type.
2253 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2254 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2255
2256 switch (ISD) {
2257 default:
2258 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2259 Op2Info);
2260 case ISD::SDIV:
2261 if (Op2Info.isConstant() && Op2Info.isUniform() && Op2Info.isPowerOf2()) {
2262      // On AArch64, scalar signed division by a power-of-two constant is
2263      // normally expanded to the sequence ADD + CMP + SELECT + SRA.
2264      // The OperandValue properties may not be the same as those of the
2265      // previous operation; conservatively assume OP_None.
2267 Instruction::Add, Ty, CostKind,
2268 Op1Info.getNoProps(), Op2Info.getNoProps());
2269 Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
2270 Op1Info.getNoProps(), Op2Info.getNoProps());
2272 Instruction::Select, Ty, CostKind,
2273 Op1Info.getNoProps(), Op2Info.getNoProps());
2274 Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
2275 Op1Info.getNoProps(), Op2Info.getNoProps());
2276 return Cost;
2277 }
2278 [[fallthrough]];
2279 case ISD::UDIV: {
2280 if (Op2Info.isConstant() && Op2Info.isUniform()) {
2281 auto VT = TLI->getValueType(DL, Ty);
2282 if (TLI->isOperationLegalOrCustom(ISD::MULHU, VT)) {
2283        // Vector signed division by a constant is expanded to the
2284        // sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
2285        // to MULHU + SUB + SRL + ADD + SRL.
2287 Instruction::Mul, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2289 Instruction::Add, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2291 Instruction::AShr, Ty, CostKind, Op1Info.getNoProps(), Op2Info.getNoProps());
2292 return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
2293 }
2294 }
2295
2297 Opcode, Ty, CostKind, Op1Info, Op2Info);
2298 if (Ty->isVectorTy()) {
2299 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
2300        // When SDIV/UDIV operations are lowered using SVE, the cost is
2301        // lower.
2302 if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
2303 ->getPrimitiveSizeInBits()
2304 .getFixedValue() < 128) {
2305 EVT VT = TLI->getValueType(DL, Ty);
2306 static const CostTblEntry DivTbl[]{
2313
2314 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
2315 if (nullptr != Entry)
2316 return Entry->Cost;
2317 }
2318 // For 8/16-bit elements, the cost is higher because the type
2319 // requires promotion and possibly splitting:
2320 if (LT.second.getScalarType() == MVT::i8)
2321 Cost *= 8;
2322 else if (LT.second.getScalarType() == MVT::i16)
2323 Cost *= 4;
2324 return Cost;
2325 } else {
2326        // If one of the operands is a uniform constant then the cost for each
2327        // element is the cost of insertion, extraction and division:
2328        // insertion cost = 2, extraction cost = 2, and division = the cost of
2329        // the operation on the scalar type.
2330 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
2331 (Op2Info.isConstant() && Op2Info.isUniform())) {
2332 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
2334 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
2335 return (4 + DivCost) * VTy->getNumElements();
2336 }
2337 }
2338 // On AArch64, without SVE, vector divisions are expanded
2339 // into scalar divisions of each pair of elements.
2340 Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty,
2341 CostKind, Op1Info, Op2Info);
2342 Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
2343 Op1Info, Op2Info);
2344 }
2345
2346 // TODO: if one of the arguments is scalar, then it's not necessary to
2347 // double the cost of handling the vector elements.
2348 Cost += Cost;
2349 }
2350 return Cost;
2351 }
2352 case ISD::MUL:
2353    // When SVE is available, we can lower the v2i64 operation using
2354    // the SVE mul instruction, which has a lower cost.
2355 if (LT.second == MVT::v2i64 && ST->hasSVE())
2356 return LT.first;
2357
2358 // When SVE is not available, there is no MUL.2d instruction,
2359 // which means mul <2 x i64> is expensive as elements are extracted
2360 // from the vectors and the muls scalarized.
2361 // As getScalarizationOverhead is a bit too pessimistic, we
2362 // estimate the cost for a i64 vector directly here, which is:
2363 // - four 2-cost i64 extracts,
2364 // - two 2-cost i64 inserts, and
2365 // - two 1-cost muls.
2366    // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
2367    // LT.first = 2 the cost is 28. If both operands are extensions it will not
2368    // need to scalarize, so the cost can be cheaper (smull or umull).
2370 if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
2371 return LT.first;
2372 return LT.first * 14;
2373 case ISD::ADD:
2374 case ISD::XOR:
2375 case ISD::OR:
2376 case ISD::AND:
2377 case ISD::SRL:
2378 case ISD::SRA:
2379 case ISD::SHL:
2380 // These nodes are marked as 'custom' for combining purposes only.
2381 // We know that they are legal. See LowerAdd in ISelLowering.
2382 return LT.first;
2383
2384 case ISD::FADD:
2385 case ISD::FSUB:
2386 case ISD::FMUL:
2387 case ISD::FDIV:
2388 case ISD::FNEG:
2389 // These nodes are marked as 'custom' just to lower them to SVE.
2390 // We know said lowering will incur no additional cost.
2391 if (!Ty->getScalarType()->isFP128Ty())
2392 return 2 * LT.first;
2393
2394 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
2395 Op2Info);
2396 }
2397}
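// Worked example (editorial addition, not from the LLVM source): for
//   %q = udiv <4 x i32> %x, <i32 7, i32 7, i32 7, i32 7>
// the divisor is a uniform constant and MULHU is legal for v4i32, so the cost
// is modelled as 2 * MulCost + 2 * AddCost + 2 * ShrCost + 1, i.e. the
// magic-number multiply/shift expansion rather than full scalarization.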
2398
2400 ScalarEvolution *SE,
2401 const SCEV *Ptr) {
2402 // Address computations in vectorized code with non-consecutive addresses will
2403 // likely result in more instructions compared to scalar code where the
2404 // computation can more often be merged into the index mode. The resulting
2405 // extra micro-ops can significantly decrease throughput.
2406 unsigned NumVectorInstToHideOverhead = 10;
2407 int MaxMergeDistance = 64;
2408
2409 if (Ty->isVectorTy() && SE &&
2410 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
2411 return NumVectorInstToHideOverhead;
2412
2413 // In many cases the address computation is not merged into the instruction
2414 // addressing mode.
2415 return 1;
2416}
2417
2419 Type *CondTy,
2420 CmpInst::Predicate VecPred,
2422 const Instruction *I) {
2423 // TODO: Handle other cost kinds.
2425 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2426 I);
2427
2428 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2429  // We don't lower vector selects well when they are wider than the register
2430  // width.
2431 if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {
2432 // We would need this many instructions to hide the scalarization happening.
2433 const int AmortizationCost = 20;
2434
2435 // If VecPred is not set, check if we can get a predicate from the context
2436 // instruction, if its type matches the requested ValTy.
2437 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
2438 CmpInst::Predicate CurrentPred;
2439 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
2440 m_Value())))
2441 VecPred = CurrentPred;
2442 }
2443 // Check if we have a compare/select chain that can be lowered using
2444 // a (F)CMxx & BFI pair.
2445 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
2446 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
2447 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
2448 VecPred == CmpInst::FCMP_UNE) {
2449 static const auto ValidMinMaxTys = {
2452 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
2453
2454 auto LT = getTypeLegalizationCost(ValTy);
2455 if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
2456 (ST->hasFullFP16() &&
2457 any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))
2458 return LT.first;
2459 }
2460
2461 static const TypeConversionCostTblEntry
2462 VectorSelectTbl[] = {
2466 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
2467 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
2468 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
2469 };
2470
2471 EVT SelCondTy = TLI->getValueType(DL, CondTy);
2472 EVT SelValTy = TLI->getValueType(DL, ValTy);
2473 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
2474 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, ISD,
2475 SelCondTy.getSimpleVT(),
2476 SelValTy.getSimpleVT()))
2477 return Entry->Cost;
2478 }
2479 }
2480 // The base case handles scalable vectors fine for now, since it treats the
2481 // cost as 1 * legalization cost.
2482 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind, I);
2483}
2484
2486AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
2488 if (ST->requiresStrictAlign()) {
2489 // TODO: Add cost modeling for strict align. Misaligned loads expand to
2490 // a bunch of instructions when strict align is enabled.
2491 return Options;
2492 }
2493 Options.AllowOverlappingLoads = true;
2494 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
2495 Options.NumLoadsPerBlock = Options.MaxNumLoads;
2496  // TODO: Though vector loads usually perform well on AArch64, on some targets
2497  // they may wake up the FP unit, which raises the power consumption. Perhaps
2498  // they could be used with no holds barred (-O3).
2499 Options.LoadSizes = {8, 4, 2, 1};
2500 return Options;
2501}
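// Illustrative note (editorial addition, not from the LLVM source): with
// LoadSizes = {8, 4, 2, 1} and AllowOverlappingLoads set, a 15-byte memcmp can
// be expanded as two overlapping 8-byte loads per operand (at offsets 0 and 7)
// instead of an 8 + 4 + 2 + 1 load chain, subject to the MaxNumLoads budget
// chosen above.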
2502
2504 return ST->hasSVE();
2505}
2506
2509 Align Alignment, unsigned AddressSpace,
2511 if (useNeonVector(Src))
2512 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2513 CostKind);
2514 auto LT = getTypeLegalizationCost(Src);
2515 if (!LT.first.isValid())
2517
2518 // The code-generator is currently not able to handle scalable vectors
2519 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2520 // it. This change will be removed when code-generation for these types is
2521 // sufficiently reliable.
2522 if (cast<VectorType>(Src)->getElementCount() == ElementCount::getScalable(1))
2524
2525 return LT.first;
2526}
2527
2528static unsigned getSVEGatherScatterOverhead(unsigned Opcode) {
2529 return Opcode == Instruction::Load ? SVEGatherOverhead : SVEScatterOverhead;
2530}
2531
2533 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
2534 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
2535 if (useNeonVector(DataTy))
2536 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
2537 Alignment, CostKind, I);
2538 auto *VT = cast<VectorType>(DataTy);
2539 auto LT = getTypeLegalizationCost(DataTy);
2540 if (!LT.first.isValid())
2542
2543 // The code-generator is currently not able to handle scalable vectors
2544 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2545 // it. This change will be removed when code-generation for these types is
2546 // sufficiently reliable.
2547 if (cast<VectorType>(DataTy)->getElementCount() ==
2550
2551 ElementCount LegalVF = LT.second.getVectorElementCount();
2552 InstructionCost MemOpCost =
2553 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
2554 {TTI::OK_AnyValue, TTI::OP_None}, I);
2555 // Add on an overhead cost for using gathers/scatters.
2556 // TODO: At the moment this is applied unilaterally for all CPUs, but at some
2557 // point we may want a per-CPU overhead.
2558 MemOpCost *= getSVEGatherScatterOverhead(Opcode);
2559 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
2560}
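// Worked sketch (editorial addition, not from the LLVM source; the concrete
// figures are assumptions): for a gather of <vscale x 4 x i32> the cost is
// roughly
//   LT.first * (scalar i32 load cost * sve-gather-overhead) * MaxNumElements
// so with the default overhead of 10 and a subtarget whose maximum vector
// length bounds <vscale x 4> at 8 elements, every potential lane is charged as
// an expensive independent access.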
2561
2563 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
2564}
2565
2567 MaybeAlign Alignment,
2568 unsigned AddressSpace,
2570 TTI::OperandValueInfo OpInfo,
2571 const Instruction *I) {
2572 EVT VT = TLI->getValueType(DL, Ty, true);
2573 // Type legalization can't handle structs
2574 if (VT == MVT::Other)
2575 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
2576 CostKind);
2577
2578 auto LT = getTypeLegalizationCost(Ty);
2579 if (!LT.first.isValid())
2581
2582 // The code-generator is currently not able to handle scalable vectors
2583 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
2584 // it. This change will be removed when code-generation for these types is
2585 // sufficiently reliable.
2586 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
2587 if (VTy->getElementCount() == ElementCount::getScalable(1))
2589
2590 // TODO: consider latency as well for TCK_SizeAndLatency.
2592 return LT.first;
2593
2595 return 1;
2596
2597 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
2598 LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {
2599 // Unaligned stores are extremely inefficient. We don't split all
2600    // unaligned 128-bit stores because of the negative impact that has shown
2601    // in practice on inlined block copy code.
2602 // We make such stores expensive so that we will only vectorize if there
2603 // are 6 other instructions getting vectorized.
2604 const int AmortizationCost = 6;
2605
2606 return LT.first * 2 * AmortizationCost;
2607 }
2608
2609 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
2610 if (Ty->isPtrOrPtrVectorTy())
2611 return LT.first;
2612
2613 // Check truncating stores and extending loads.
2614 if (useNeonVector(Ty) &&
2615 Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
2616    // v4i8 types are lowered to a scalar load/store and sshll/xtn.
2617 if (VT == MVT::v4i8)
2618 return 2;
2619 // Otherwise we need to scalarize.
2620 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
2621 }
2622
2623 return LT.first;
2624}
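// Worked examples (editorial addition, not from the LLVM source): on a
// subtarget where misaligned 128-bit stores are slow, an under-aligned store of
// <4 x i32> costs LT.first * 2 * 6 = 12, which discourages vectorization unless
// enough other work is vectorized alongside it; a truncating store of <4 x i8>
// costs 2 (a scalar store plus packing), and other mismatched NEON types fall
// back to 2 per element.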
2625
2627 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
2628 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
2629 bool UseMaskForCond, bool UseMaskForGaps) {
2630 assert(Factor >= 2 && "Invalid interleave factor");
2631 auto *VecVTy = cast<FixedVectorType>(VecTy);
2632
2633 if (!UseMaskForCond && !UseMaskForGaps &&
2634 Factor <= TLI->getMaxSupportedInterleaveFactor()) {
2635 unsigned NumElts = VecVTy->getNumElements();
2636 auto *SubVecTy =
2637 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
2638
2639 // ldN/stN only support legal vector types of size 64 or 128 in bits.
2640 // Accesses having vector types that are a multiple of 128 bits can be
2641 // matched to more than one ldN/stN instruction.
2642 bool UseScalable;
2643 if (NumElts % Factor == 0 &&
2644 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
2645 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
2646 }
2647
2648 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
2649 Alignment, AddressSpace, CostKind,
2650 UseMaskForCond, UseMaskForGaps);
2651}
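// Worked example (editorial addition, not from the LLVM source): an interleaved
// group with Factor = 2 over <8 x i32> gives SubVecTy = <4 x i32>, a legal
// 128-bit NEON type, so the returned cost is
//   Factor * getNumInterleavedAccesses(<4 x i32>) = 2 * 1 = 2
// which corresponds to a single ld2/st2 touching two 128-bit registers.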
2652
2657 for (auto *I : Tys) {
2658 if (!I->isVectorTy())
2659 continue;
2660 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
2661 128)
2662 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
2663 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
2664 }
2665 return Cost;
2666}
2667
2669 return ST->getMaxInterleaveFactor();
2670}
2671
2672// For Falkor, we want to avoid having too many strided loads in a loop since
2673// that can exhaust the HW prefetcher resources. We adjust the unroller
2674// MaxCount preference below to attempt to ensure unrolling doesn't create too
2675// many strided loads.
2676static void
2679 enum { MaxStridedLoads = 7 };
2680 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
2681 int StridedLoads = 0;
2682 // FIXME? We could make this more precise by looking at the CFG and
2683 // e.g. not counting loads in each side of an if-then-else diamond.
2684 for (const auto BB : L->blocks()) {
2685 for (auto &I : *BB) {
2686 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
2687 if (!LMemI)
2688 continue;
2689
2690 Value *PtrValue = LMemI->getPointerOperand();
2691 if (L->isLoopInvariant(PtrValue))
2692 continue;
2693
2694 const SCEV *LSCEV = SE.getSCEV(PtrValue);
2695 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
2696 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
2697 continue;
2698
2699 // FIXME? We could take pairing of unrolled load copies into account
2700 // by looking at the AddRec, but we would probably have to limit this
2701 // to loops with no stores or other memory optimization barriers.
2702 ++StridedLoads;
2703 // We've seen enough strided loads that seeing more won't make a
2704 // difference.
2705 if (StridedLoads > MaxStridedLoads / 2)
2706 return StridedLoads;
2707 }
2708 }
2709 return StridedLoads;
2710 };
2711
2712 int StridedLoads = countStridedLoads(L, SE);
2713 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
2714 << " strided loads\n");
2715 // Pick the largest power of 2 unroll count that won't result in too many
2716 // strided loads.
2717 if (StridedLoads) {
2718 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
2719 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
2720 << UP.MaxCount << '\n');
2721 }
2722}
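// Worked example (editorial addition, not from the LLVM source): with
// MaxStridedLoads = 7 and three strided loads detected in the loop body,
// UP.MaxCount = 1 << Log2_32(7 / 3) = 1 << 1 = 2, so unrolling creates at most
// six strided load streams and stays within the HW prefetcher's budget.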
2723
2727 // Enable partial unrolling and runtime unrolling.
2728 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
2729
2730 UP.UpperBound = true;
2731
2732  // An inner loop is more likely to be hot, and the runtime check can be
2733  // hoisted out of the loop by the LICM pass, so the overhead is lower; try
2734  // a larger threshold to unroll more loops.
2735 if (L->getLoopDepth() > 1)
2736 UP.PartialThreshold *= 2;
2737
2738 // Disable partial & runtime unrolling on -Os.
2740
2744
2745 // Scan the loop: don't unroll loops with calls as this could prevent
2746 // inlining. Don't unroll vector loops either, as they don't benefit much from
2747 // unrolling.
2748 for (auto *BB : L->getBlocks()) {
2749 for (auto &I : *BB) {
2750      // Don't unroll vectorised loops.
2751 if (I.getType()->isVectorTy())
2752 return;
2753
2754 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2755 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2756 if (!isLoweredToCall(F))
2757 continue;
2758 }
2759 return;
2760 }
2761 }
2762 }
2763
2764  // Enable runtime unrolling for in-order models.
2765  // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Others, so by
2766  // checking for that case we can ensure that the default behaviour is
2767  // unchanged.
2769 !ST->getSchedModel().isOutOfOrder()) {
2770 UP.Runtime = true;
2771 UP.Partial = true;
2772 UP.UnrollRemainder = true;
2774
2775 UP.UnrollAndJam = true;
2777 }
2778}
2779
2783}
2784
2786 Type *ExpectedType) {
2787 switch (Inst->getIntrinsicID()) {
2788 default:
2789 return nullptr;
2790 case Intrinsic::aarch64_neon_st2:
2791 case Intrinsic::aarch64_neon_st3:
2792 case Intrinsic::aarch64_neon_st4: {
2793 // Create a struct type
2794 StructType *ST = dyn_cast<StructType>(ExpectedType);
2795 if (!ST)
2796 return nullptr;
2797 unsigned NumElts = Inst->arg_size() - 1;
2798 if (ST->getNumElements() != NumElts)
2799 return nullptr;
2800 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2801 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
2802 return nullptr;
2803 }
2804 Value *Res = PoisonValue::get(ExpectedType);
2805 IRBuilder<> Builder(Inst);
2806 for (unsigned i = 0, e = NumElts; i != e; ++i) {
2807 Value *L = Inst->getArgOperand(i);
2808 Res = Builder.CreateInsertValue(Res, L, i);
2809 }
2810 return Res;
2811 }
2812 case Intrinsic::aarch64_neon_ld2:
2813 case Intrinsic::aarch64_neon_ld3:
2814 case Intrinsic::aarch64_neon_ld4:
2815 if (Inst->getType() == ExpectedType)
2816 return Inst;
2817 return nullptr;
2818 }
2819}
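// Illustrative sketch (editorial addition, not from the LLVM source; the exact
// intrinsic mangling is an assumption): for
//   call void @llvm.aarch64.neon.st3.v4i32.p0(<4 x i32> %a, <4 x i32> %b,
//                                             <4 x i32> %c, ptr %p)
// and ExpectedType = { <4 x i32>, <4 x i32>, <4 x i32> }, the three stored
// values are repacked with insertvalue into a struct, letting a later ld3 from
// %p be forwarded from the store instead of reloading memory.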
2820
2822 MemIntrinsicInfo &Info) {
2823 switch (Inst->getIntrinsicID()) {
2824 default:
2825 break;
2826 case Intrinsic::aarch64_neon_ld2:
2827 case Intrinsic::aarch64_neon_ld3:
2828 case Intrinsic::aarch64_neon_ld4:
2829 Info.ReadMem = true;
2830 Info.WriteMem = false;
2831 Info.PtrVal = Inst->getArgOperand(0);
2832 break;
2833 case Intrinsic::aarch64_neon_st2:
2834 case Intrinsic::aarch64_neon_st3:
2835 case Intrinsic::aarch64_neon_st4:
2836 Info.ReadMem = false;
2837 Info.WriteMem = true;
2838 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
2839 break;
2840 }
2841
2842 switch (Inst->getIntrinsicID()) {
2843 default:
2844 return false;
2845 case Intrinsic::aarch64_neon_ld2:
2846 case Intrinsic::aarch64_neon_st2:
2847 Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
2848 break;
2849 case Intrinsic::aarch64_neon_ld3:
2850 case Intrinsic::aarch64_neon_st3:
2851 Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
2852 break;
2853 case Intrinsic::aarch64_neon_ld4:
2854 case Intrinsic::aarch64_neon_st4:
2855 Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
2856 break;
2857 }
2858 return true;
2859}
2860
2861/// See if \p I should be considered for address type promotion. We check if \p
2862/// I is a sext with the right type that is used in memory accesses. If it is
2863/// used in a "complex" getelementptr, we allow it to be promoted without
2864/// finding other sext instructions that sign extended the same initial value.
2865/// A getelementptr is considered "complex" if it has more than 2 operands.
2867 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
2868 bool Considerable = false;
2869 AllowPromotionWithoutCommonHeader = false;
2870 if (!isa<SExtInst>(&I))
2871 return false;
2872 Type *ConsideredSExtType =
2873 Type::getInt64Ty(I.getParent()->getParent()->getContext());
2874 if (I.getType() != ConsideredSExtType)
2875 return false;
2876 // See if the sext is the one with the right type and used in at least one
2877 // GetElementPtrInst.
2878 for (const User *U : I.users()) {
2879 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
2880 Considerable = true;
2881 // A getelementptr is considered as "complex" if it has more than 2
2882 // operands. We will promote a SExt used in such complex GEP as we
2883 // expect some computation to be merged if they are done on 64 bits.
2884 if (GEPInst->getNumOperands() > 2) {
2885 AllowPromotionWithoutCommonHeader = true;
2886 break;
2887 }
2888 }
2889 }
2890 return Considerable;
2891}
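// Illustrative example (editorial addition, not from the LLVM source): a sext
// feeding a multi-operand GEP such as
//   %idx = sext i32 %i to i64
//   %p   = getelementptr [64 x i32], ptr %base, i64 0, i64 %idx
// is reported as considerable, and because the GEP has more than two operands
// AllowPromotionWithoutCommonHeader is set, on the expectation that the 64-bit
// index arithmetic folds into the address computation.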
2892
2894 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
2895 if (!VF.isScalable())
2896 return true;
2897
2898 Type *Ty = RdxDesc.getRecurrenceType();
2900 return false;
2901
2902 switch (RdxDesc.getRecurrenceKind()) {
2903 case RecurKind::Add:
2904 case RecurKind::FAdd:
2905 case RecurKind::And:
2906 case RecurKind::Or:
2907 case RecurKind::Xor:
2908 case RecurKind::SMin:
2909 case RecurKind::SMax:
2910 case RecurKind::UMin:
2911 case RecurKind::UMax:
2912 case RecurKind::FMin:
2913 case RecurKind::FMax:
2916 case RecurKind::FMulAdd:
2917 return true;
2918 default:
2919 return false;
2920 }
2921}
2922
2925 bool IsUnsigned,
2927 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2928
2929 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
2930 return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
2931
2932 assert((isa<ScalableVectorType>(Ty) == isa<ScalableVectorType>(CondTy)) &&
2933         "Both vectors need to be equally scalable");
2934
2935 InstructionCost LegalizationCost = 0;
2936 if (LT.first > 1) {
2937 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2938 unsigned MinMaxOpcode =
2939 Ty->isFPOrFPVectorTy()
2940 ? Intrinsic::maxnum
2941 : (IsUnsigned ? Intrinsic::umin : Intrinsic::smin);
2942 IntrinsicCostAttributes Attrs(MinMaxOpcode, LegalVTy, {LegalVTy, LegalVTy});
2943 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
2944 }
2945
2946 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
2947}
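// Worked example (editorial addition, not from the LLVM source): an unsigned
// min reduction over <8 x i32> legalizes to two v4i32 halves (LT.first == 2),
// so the model charges one llvm.umin on the legal halves plus the flat cost of
// 2 for the final horizontal reduction, i.e. LegalizationCost + 2.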
2948
2950 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) {
2951 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2952 InstructionCost LegalizationCost = 0;
2953 if (LT.first > 1) {
2954 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
2955 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
2956 LegalizationCost *= LT.first - 1;
2957 }
2958
2959 int ISD = TLI->InstructionOpcodeToISD(Opcode);
2960 assert(ISD && "Invalid opcode");
2961 // Add the final reduction cost for the legal horizontal reduction
2962 switch (ISD) {
2963 case ISD::ADD:
2964 case ISD::AND:
2965 case ISD::OR:
2966 case ISD::XOR:
2967 case ISD::FADD:
2968 return LegalizationCost + 2;
2969 default:
2971 }
2972}
2973
2976 std::optional<FastMathFlags> FMF,
2979 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
2980 InstructionCost BaseCost =
2981 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2982 // Add on extra cost to reflect the extra overhead on some CPUs. We still
2983 // end up vectorizing for more computationally intensive loops.
2984 return BaseCost + FixedVTy->getNumElements();
2985 }
2986
2987 if (Opcode != Instruction::FAdd)
2989
2990 auto *VTy = cast<ScalableVectorType>(ValTy);
2992 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
2993 Cost *= getMaxNumElements(VTy->getElementCount());
2994 return Cost;
2995 }
2996
2997 if (isa<ScalableVectorType>(ValTy))
2998 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
2999
3000 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
3001 MVT MTy = LT.second;
3002 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3003 assert(ISD && "Invalid opcode");
3004
3005 // Horizontal adds can use the 'addv' instruction. We model the cost of these
3006 // instructions as twice a normal vector add, plus 1 for each legalization
3007 // step (LT.first). This is the only arithmetic vector reduction operation for
3008 // which we have an instruction.
3009 // OR, XOR and AND costs should match the codegen from:
3010 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
3011 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
3012 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
3013 static const CostTblEntry CostTblNoPairwise[]{
3014 {ISD::ADD, MVT::v8i8, 2},
3015 {ISD::ADD, MVT::v16i8, 2},
3016 {ISD::ADD, MVT::v4i16, 2},
3017 {ISD::ADD, MVT::v8i16, 2},
3018 {ISD::ADD, MVT::v4i32, 2},
3019 {ISD::ADD, MVT::v2i64, 2},
3020 {ISD::OR, MVT::v8i8, 15},
3021 {ISD::OR, MVT::v16i8, 17},
3022 {ISD::OR, MVT::v4i16, 7},
3023 {ISD::OR, MVT::v8i16, 9},
3024 {ISD::OR, MVT::v2i32, 3},
3025 {ISD::OR, MVT::v4i32, 5},
3026 {ISD::OR, MVT::v2i64, 3},
3027 {ISD::XOR, MVT::v8i8, 15},
3028 {ISD::XOR, MVT::v16i8, 17},
3029 {ISD::XOR, MVT::v4i16, 7},
3030 {ISD::XOR, MVT::v8i16, 9},
3031 {ISD::XOR, MVT::v2i32, 3},
3032 {ISD::XOR, MVT::v4i32, 5},
3033 {ISD::XOR, MVT::v2i64, 3},
3034 {ISD::AND, MVT::v8i8, 15},
3035 {ISD::AND, MVT::v16i8, 17},
3036 {ISD::AND, MVT::v4i16, 7},
3037 {ISD::AND, MVT::v8i16, 9},
3038 {ISD::AND, MVT::v2i32, 3},
3039 {ISD::AND, MVT::v4i32, 5},
3040 {ISD::AND, MVT::v2i64, 3},
3041 };
3042 switch (ISD) {
3043 default:
3044 break;
3045 case ISD::ADD:
3046 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
3047 return (LT.first - 1) + Entry->Cost;
3048 break;
3049 case ISD::XOR:
3050 case ISD::AND:
3051 case ISD::OR:
3052 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
3053 if (!Entry)
3054 break;
3055 auto *ValVTy = cast<FixedVectorType>(ValTy);
3056 if (!ValVTy->getElementType()->isIntegerTy(1) &&
3057 MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
3058 isPowerOf2_32(ValVTy->getNumElements())) {
3059 InstructionCost ExtraCost = 0;
3060 if (LT.first != 1) {
3061 // Type needs to be split, so there is an extra cost of LT.first - 1
3062 // arithmetic ops.
3063 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
3064 MTy.getVectorNumElements());
3065 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
3066 ExtraCost *= LT.first - 1;
3067 }
3068 return Entry->Cost + ExtraCost;
3069 }
3070 break;
3071 }
3072 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
3073}
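// Worked example (editorial addition, not from the LLVM source): an add
// reduction over <4 x i32> hits {ISD::ADD, MVT::v4i32, 2} above, i.e. roughly
// the cost of a single addv, while an OR reduction over <4 x i32> is costed at
// 5 to match the sequence checked in llvm/test/CodeGen/AArch64/reduce-or.ll.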
3074
3075InstructionCost AArch64TTIImpl::getSpliceCost(VectorType *Tp, int Index) {
3076  static const CostTblEntry ShuffleTbl[] = {
3090 };
3091
3092 // The code-generator is currently not able to handle scalable vectors
3093 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
3094 // it. This change will be removed when code-generation for these types is
3095 // sufficiently reliable.
3098
3099 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3100 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
3102 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
3103 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
3104 : LT.second;
3105 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
3106 InstructionCost LegalizationCost = 0;
3107 if (Index < 0) {
3108 LegalizationCost =
3109 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
3111 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
3113 }
3114
3115 // Predicated splice are promoted when lowering. See AArch64ISelLowering.cpp
3116 // Cost performed on a promoted type.
3117 if (LT.second.getScalarType() == MVT::i1) {
3118 LegalizationCost +=
3119 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
3121 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
3123 }
3124 const auto *Entry =
3125 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
3126 assert(Entry && "Illegal Type for Splice");
3127 LegalizationCost += Entry->Cost;
3128 return LegalizationCost * LT.first;
3129}
3130
3132 VectorType *Tp,
3133 ArrayRef<int> Mask,
3135 int Index, VectorType *SubTp,
3137 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
3138 // If we have a Mask, and the LT is being legalized somehow, split the Mask
3139 // into smaller vectors and sum the cost of each shuffle.
3140 if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
3141 Tp->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
3142 cast<FixedVectorType>(Tp)->getNumElements() >
3143 LT.second.getVectorNumElements() &&
3144 !Index && !SubTp) {
3145 unsigned TpNumElts = cast<FixedVectorType>(Tp)->getNumElements();
3146 assert(Mask.size() == TpNumElts && "Expected Mask and Tp size to match!");
3147 unsigned LTNumElts = LT.second.getVectorNumElements();
3148 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
3149 VectorType *NTp =
3150 VectorType::get(Tp->getScalarType(), LT.second.getVectorElementCount());
3152 for (unsigned N = 0; N < NumVecs; N++) {
3153 SmallVector<int> NMask;
3154 // Split the existing mask into chunks of size LTNumElts. Track the source
3155 // sub-vectors to ensure the result has at most 2 inputs.
3156 unsigned Source1, Source2;
3157 unsigned NumSources = 0;
3158 for (unsigned E = 0; E < LTNumElts; E++) {
3159 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
3160 : UndefMaskElem;
3161 if (MaskElt < 0) {
3162 NMask.push_back(UndefMaskElem);
3163 continue;
3164 }
3165
3166 // Calculate which source from the input this comes from and whether it
3167 // is new to us.
3168 unsigned Source = MaskElt / LTNumElts;
3169 if (NumSources == 0) {
3170 Source1 = Source;
3171 NumSources = 1;
3172 } else if (NumSources == 1 && Source != Source1) {
3173 Source2 = Source;
3174 NumSources = 2;
3175 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
3176 NumSources++;
3177 }
3178
3179 // Add to the new mask. For the NumSources>2 case these are not correct,
3180 // but are only used for the modular lane number.
3181 if (Source == Source1)
3182 NMask.push_back(MaskElt % LTNumElts);
3183 else if (Source == Source2)
3184 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
3185 else
3186 NMask.push_back(MaskElt % LTNumElts);
3187 }
3188 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
3189 // getShuffleCost. If not then cost it using the worst case.
3190 if (NumSources <= 2)
3191 Cost += getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
3193 NTp, NMask, CostKind, 0, nullptr, Args);
3194 else if (any_of(enumerate(NMask), [&](const auto &ME) {
3195 return ME.value() % LTNumElts == ME.index();
3196 }))
3197 Cost += LTNumElts - 1;
3198 else
3199 Cost += LTNumElts;
3200 }
3201 return Cost;
3202 }
3203
3204 Kind = improveShuffleKindFromMask(Kind, Mask);
3205
3206 // Check for broadcast loads.
3207 if (Kind == TTI::SK_Broadcast) {
3208 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
3209 if (IsLoad && LT.second.isVector() &&
3211 LT.second.getVectorElementCount()))
3212 return 0; // broadcast is handled by ld1r
3213 }
3214
3215 // If we have 4 elements for the shuffle and a Mask, get the cost straight
3216 // from the perfect shuffle tables.
3217 if (Mask.size() == 4 && Tp->getElementCount() == ElementCount::getFixed(4) &&
3218 (Tp->getScalarSizeInBits() == 16 || Tp->getScalarSizeInBits() == 32) &&
3219 all_of(Mask, [](int E) { return E < 8; }))
3220 return getPerfectShuffleCost(Mask);
3221
3222 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
3223 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
3224 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
3225 static const CostTblEntry ShuffleTbl[] = {
3226 // Broadcast shuffle kinds can be performed with 'dup'.
3237 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
3238 // 'zip1/zip2' instructions.
3249 // Select shuffle kinds.
3250 // TODO: handle vXi8/vXi16.
3251 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
3252 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
3253 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
3254 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
3255 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
3256 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
3257 // PermuteSingleSrc shuffle kinds.
3259 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
3262 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
3264 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
3265 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
3267 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
3268 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
3269 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
3270 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
3271 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
3272 // Reverse can be lowered with `rev`.
3273 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
3274 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
3275 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
3276 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
3277 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
3278 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
3279 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
3280 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
3281 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
3282 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
3283 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
3284 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
3285 // Splice can all be lowered as `ext`.
3300 // Broadcast shuffle kinds for scalable vectors
3318 // Handle the cases for vector.reverse with scalable vectors
3336 };
3337 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
3338 return LT.first * Entry->Cost;
3339 }
3340
3341 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(Tp))
3342 return getSpliceCost(Tp, Index);
3343
3344 // Inserting a subvector can often be done with either a D, S or H register
3345 // move, so long as the inserted vector is "aligned".
3346 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
3347 LT.second.getSizeInBits() <= 128 && SubTp) {
3348 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
3349 if (SubLT.second.isVector()) {
3350 int NumElts = LT.second.getVectorNumElements();
3351 int NumSubElts = SubLT.second.getVectorNumElements();
3352 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
3353 return SubLT.first;
3354 }
3355 }
3356
3357 return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
3358}
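// Worked example (editorial addition, not from the LLVM source): shuffling
// <8 x i32> with mask <0, 1, 2, 3, 8, 9, 10, 11> when the legal type is v4i32
// (LTNumElts = 4) splits into two sub-masks; the first chunk only reads input
// sub-vector 0 and the second only sub-vector 2, so each is re-costed as a
// single-source v4i32 permute and the two results are summed.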
3359
3361 Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
3363 InterleavedAccessInfo *IAI) {
3364 if (!ST->hasSVE() || TailFoldingKindLoc == TailFoldingKind::TFDisabled)
3365 return false;
3366
3367 // We don't currently support vectorisation with interleaving for SVE - with
3368 // such loops we're better off not using tail-folding. This gives us a chance
3369 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
3370 if (IAI->hasGroups())
3371 return false;
3372
3373 TailFoldingKind Required; // Defaults to 0.
3374 if (LVL->getReductionVars().size())
3375 Required.add(TailFoldingKind::TFReductions);
3376 if (LVL->getFixedOrderRecurrences().size())
3377 Required.add(TailFoldingKind::TFRecurrences);
3378 if (!Required)
3379 Required.add(TailFoldingKind::TFSimple);
3380
3381 return (TailFoldingKindLoc & Required) == Required;
3382}
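// Illustrative note (editorial addition, not from the LLVM source): under
// -sve-tail-folding=reductions, a loop whose only special recurrence is a
// reduction sets Required = TFReductions and is tail-folded, while a plain loop
// sets Required = TFSimple, which is not in the enabled mask, so it keeps the
// unpredicated body plus scalar epilogue.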
3383
3386 int64_t BaseOffset, bool HasBaseReg,
3387 int64_t Scale, unsigned AddrSpace) const {
3388 // Scaling factors are not free at all.
3389 // Operands | Rt Latency
3390 // -------------------------------------------
3391 // Rt, [Xn, Xm] | 4
3392 // -------------------------------------------
3393 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
3394 // Rt, [Xn, Wm, <extend> #imm] |
3396 AM.BaseGV = BaseGV;
3397 AM.BaseOffs = BaseOffset;
3398 AM.HasBaseReg = HasBaseReg;
3399 AM.Scale = Scale;
3400 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
3401 // Scale represents reg2 * scale, thus account for 1 if
3402 // it is not equal to 0 or 1.
3403 return AM.Scale != 0 && AM.Scale != 1;
3404 return -1;
3405}
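// Illustrative example (editorial addition, not from the LLVM source): an i64
// access with Scale = 8 maps to the legal addressing mode [Xn, Xm, lsl #3];
// since Scale is neither 0 nor 1 the function returns 1, modelling the extra
// latency of the scaled-register form, whereas an illegal combination
// returns -1.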
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
cl::opt< TailFoldingKind, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE:" "\ndisabled No loop types will vectorize using tail-folding" "\ndefault Uses the default tail-folding settings for the target " "CPU" "\nall All legal loop types will vectorize using tail-folding" "\nsimple Use tail-folding for simple loops (not reductions or " "recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences"), cl::location(TailFoldingKindLoc))
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVEVectorMul(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
TailFoldingKind TailFoldingKindLoc
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfo::Concept conforming object specific to the AArch64 target machine.
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
#define LLVM_DEBUG(X)
Definition: Debug.h:101
This file provides the interface for the instcombine pass implementation.
This file defines the LoopVectorizationLegality class.
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
bool isStreamingSVEModeDisabled() const
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
bool useSVEForFixedLengthVectors() const
unsigned getMinSVEVectorSizeInBits() const
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxInterleaveFactor(unsigned VF)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI, DominatorTree *DT, LoopVectorizationLegality *LVL, InterleavedAccessInfo *IAI)
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
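The cost hooks above are normally reached through the public TargetTransformInfo wrapper rather than called on AArch64TTIImpl directly. A minimal sketch of such a query, assuming a caller that already holds a TargetTransformInfo reference; the helper name fitsBudget and the Budget parameter are illustrative, not part of this file:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Ask the target how expensive a vector multiply is and compare the result
// against a caller-chosen budget.
static bool fitsBudget(const TargetTransformInfo &TTI, Type *VecTy,
                       InstructionCost Budget) {
  InstructionCost Cost = TTI.getArithmeticInstrCost(
      Instruction::Mul, VecTy, TargetTransformInfo::TCK_RecipThroughput);
  return Cost.isValid() && Cost <= Budget;
}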
Class for arbitrary precision integers.
Definition: APInt.h:75
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition: APInt.h:441
void negate()
Negate this APInt in place.
Definition: APInt.h:1421
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition: APInt.cpp:1002
unsigned logBase2() const
Definition: APInt.h:1700
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition: APInt.h:815
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition: APInt.h:432
int64_t getSExtValue() const
Get sign extended value.
Definition: APInt.h:1516
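As a standalone illustration of the APInt helpers listed above (not code from this file), a divisor check of the kind a power-of-two division combine might perform:

#include "llvm/ADT/APInt.h"
using namespace llvm;

// Return true if Divisor is +/- a power of two and report the shift amount
// a strength-reduced division would use.
static bool getPow2ShiftAmount(APInt Divisor, unsigned &ShiftAmt) {
  if (Divisor.isNegatedPowerOf2())
    Divisor.negate();              // work with the positive value
  if (!Divisor.isPowerOf2())
    return false;
  ShiftAmt = Divisor.logBase2();   // e.g. 8 -> 3
  return true;
}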
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
A cache of @llvm.assume calls within a function.
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
Definition: BasicTTIImpl.h:538
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:849
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:963
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr)
Definition: BasicTTIImpl.h:328
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:610
InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:813
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask) const
Definition: BasicTTIImpl.h:927
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
Definition: BasicTTIImpl.h:993
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Instruction *CopyO, const Twine &Name="", Instruction *InsertBefore=nullptr)
Definition: InstrTypes.h:248
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1351
unsigned arg_size() const
Definition: InstrTypes.h:1349
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition: InstrTypes.h:718
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition: InstrTypes.h:721
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition: InstrTypes.h:724
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition: InstrTypes.h:722
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition: InstrTypes.h:723
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition: InstrTypes.h:725
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition: InstrTypes.h:734
bool isIntPredicate() const
Definition: InstrTypes.h:826
static ConstantAggregateZero * get(Type *Ty)
Definition: Constants.cpp:1595
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
static Constant * get(Type *Ty, uint64_t V, bool IsSigned=false)
If Ty is a vector type, return a Constant with a splat of the given value.
Definition: Constants.cpp:887
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition: Constants.h:132
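For example, the splat-producing overload of ConstantInt::get above can be used roughly as follows (a sketch; the helper name and the Ctx parameter are illustrative):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

// Because the requested type is a vector, get() returns a splat of 1,
// i.e. <4 x i32> <i32 1, i32 1, i32 1, i32 1>.
static Constant *makeSplatOfOne(LLVMContext &Ctx) {
  auto *VecTy = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  return ConstantInt::get(VecTy, 1);
}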
This is an important base class in LLVM.
Definition: Constant.h:41
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
Definition: Constants.cpp:356
A parsed version of the target data layout string and methods for querying it.
Definition: DataLayout.h:114
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition: Dominators.h:166
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition: TypeSize.h:294
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition: TypeSize.h:291
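A small sketch of the two ElementCount factories (illustrative only):

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// Four fixed lanes versus "four lanes per vscale" for a scalable vector.
static ElementCount pickLaneCount(bool Scalable) {
  return Scalable ? ElementCount::getScalable(4) : ElementCount::getFixed(4);
}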
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:21
bool allowContract() const
Definition: FMF.h:71
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:698
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2550
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr="", Instruction *InsertBefore=nullptr)
The core instruction combiner logic.
Definition: InstCombiner.h:45
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Definition: InstCombiner.h:418
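The instCombineSVE* helpers listed earlier mostly share one shape: compute a simpler replacement value and hand it to the combiner through replaceInstUsesWith, or return std::nullopt when no simplification applies. A hedged skeleton of that shape (not any specific combine from this file):

#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>
using namespace llvm;

static std::optional<Instruction *>
instCombineExampleIntrinsic(InstCombiner &IC, IntrinsicInst &II) {
  Value *Op = II.getArgOperand(0);
  // If the first operand already has the result type, forward it unchanged.
  if (Op->getType() == II.getType())
    return IC.replaceInstUsesWith(II, Op);
  return std::nullopt; // no simplification found
}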
static InstructionCost getInvalid(CostType Val=0)
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not...
Definition: Instruction.cpp:70
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
Definition: DerivedTypes.h:40
Drive the analysis of interleaved memory accesses in the loop.
Definition: VectorUtils.h:765
bool hasGroups() const
Returns true if we have any interleave groups.
Definition: VectorUtils.h:829
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:177
Value * getPointerOperand()
Definition: Instructions.h:264
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:97
iterator_range< block_iterator > blocks() const
Definition: LoopInfo.h:195
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:188
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:547
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:60
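A hedged sketch using the Loop queries above, counting instructions whose operands are all loop-invariant (the helper itself is illustrative):

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static unsigned countInvariantOperandInsts(const Loop &L) {
  unsigned N = 0;
  for (BasicBlock *BB : L.blocks())
    for (Instruction &I : *BB)
      if (all_of(I.operands(),
                 [&](const Value *Op) { return L.isLoopInvariant(Op); }))
        ++N;
  return N;
}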
Machine Value Type.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
size_type size() const
Definition: MapVector.h:61
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:398
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Definition: DerivedTypes.h:651
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
Definition: Constants.cpp:1759
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Definition: IVDescriptors.h:69
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNewZAInterface() const
std::optional< bool > requiresSMChange(const SMEAttrs &Callee, bool BodyOverridesInterface=false) const
bool requiresLazySave(const SMEAttrs &Callee) const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition: Type.cpp:719
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
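For instance (a sketch, with an illustrative helper name), the SCEV for a pointer can be tested for an affine add recurrence:

#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
using namespace llvm;

// Does this pointer advance by a loop-invariant stride?
static bool isAffinePointer(ScalarEvolution &SE, Value *Ptr) {
  const auto *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Ptr));
  return AR && AR->isAffine();
}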
size_type size() const
Definition: SmallPtrSet.h:93
size_t size() const
Definition: SmallVector.h:91
iterator insert(iterator I, T &&Elt)
Definition: SmallVector.h:809
void resize(size_type N)
Definition: SmallVector.h:642
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
An instruction for storing to memory.
Definition: Instructions.h:301
StringRef - Represent a constant reference to a string, i.e.
Definition: StringRef.h:50
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition: StringRef.h:688
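For example, the pair-returning overload splits on the first occurrence of the separator only (a sketch; the helper name is illustrative):

#include "llvm/ADT/StringRef.h"
#include <utility>
using namespace llvm;

// "key=value" -> {"key", "value"}; the separator itself is dropped.
static std::pair<StringRef, StringRef> splitKeyValue(StringRef S) {
  return S.split('=');
}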
Class to represent struct types.
Definition: DerivedTypes.h:213
Provides information about what library functions are available for the current target.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
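A hedged sketch combining the queries above: map an IR type to an EVT and ask whether the target can select a floating-point add on it (helper name and parameters are illustrative):

#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DataLayout.h"
using namespace llvm;

static bool canSelectFAdd(const TargetLoweringBase &TLI, const DataLayout &DL,
                          Type *Ty) {
  EVT VT = TLI.getValueType(DL, Ty, /*AllowUnknown=*/true);
  return VT.isSimple() && TLI.isOperationLegalOrCustom(ISD::FADD, VT);
}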
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
bool isLoweredToCall(const Function *F) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
CastContextHint
Represents a hint about the context in which a cast is used.
@ None
The cast is not used with a load/store of any kind.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimunSize)
Definition: TypeSize.h:325
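And the TypeSize counterparts, expressed as a sketch (the helper name is illustrative):

#include "llvm/Support/TypeSize.h"
using namespace llvm;

// Exactly 128 bits, versus "a multiple of 128 bits scaled by vscale".
static TypeSize pickRegisterWidth(bool Scalable) {
  return Scalable ? TypeSize::getScalable(128) : TypeSize::getFixed(128);
}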
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:258
PointerType * getPointerTo(unsigned AddrSpace=0) const
Return a pointer to the current type.
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition: Type.h:146
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
Definition: Type.h:163
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
Definition: Type.h:255
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition: Type.h:222
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:210
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition: Type.h:341
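A trivial sketch of the scalar-type queries above (illustrative helper):

#include "llvm/IR/Type.h"
using namespace llvm;

// Element bit width of either a scalar or a vector type.
static unsigned elementBits(Type *Ty) {
  return Ty->getScalarType()->getScalarSizeInBits();
}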
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition: Value.cpp:918
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:994
void takeName(Value *V)
Transfer the name from V to this value.
Definition: Value.cpp:381
Base class of all SIMD vector types.
Definition: DerivedTypes.h:389
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Definition: DerivedTypes.h:627
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
Definition: Type.cpp:682
Type * getElementType() const
Definition: DerivedTypes.h:422
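A sketch showing both fixed and scalable vector construction from the same element type (the helper name and the Ctx parameter are illustrative):

#include "llvm/IR/DerivedTypes.h"
#include <utility>
using namespace llvm;

// <4 x i32> and <vscale x 4 x i32>.
static std::pair<VectorType *, VectorType *> makeVectorTypes(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  return {VectorType::get(I32, ElementCount::getFixed(4)),
          VectorType::get(I32, ElementCount::getScalable(4))};
}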
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition: TypeSize.h:166
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
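A rough sketch of how these helpers can be used to estimate the cost of materializing a 64-bit immediate; the AArch64_IMM namespace, the ImmInsnModel element type, and the header path are assumptions taken from AArch64ExpandImm.h:

#include "AArch64ExpandImm.h"          // target-private header (assumed path)
#include "llvm/ADT/SmallVector.h"
using namespace llvm;

// Count the real move-immediate instructions a 64-bit constant expands to.
static unsigned movImmInsnCount(uint64_t Imm) {
  SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
  AArch64_IMM::expandMOVImm(Imm, /*BitSize=*/64, Insn);
  return Insn.size();
}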
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition: ISDOpcodes.h:773
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition: ISDOpcodes.h:885
@ SIGN_EXTEND
Conversion operators.
Definition: ISDOpcodes.h:760
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:910
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition: ISDOpcodes.h:713
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition: ISDOpcodes.h:637
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:691
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition: ISDOpcodes.h:763
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition: ISDOpcodes.h:870
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition: ISDOpcodes.h:819
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:666
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition: ISDOpcodes.h:852
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition: ISDOpcodes.h:769
specific_intval< false > m_SpecificInt(APInt V)
Match a specific integer value or vector with all elements equal to the value.
Definition: PatternMatch.h:854
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
Definition: PatternMatch.h:84
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
Definition: PatternMatch.h:772
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
Definition: PatternMatch.h:485
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
Definition: PatternMatch.h:517
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
OneUse_match< T > m_OneUse(const T &SubPattern)
Definition: PatternMatch.h:67
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
Definition: PatternMatch.h:89
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:818
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
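A hedged sketch tying several of these matchers together: recognise a single-use select whose condition is any compare and whose false arm is the constant one (the helper name is illustrative):

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
using namespace llvm;
using namespace llvm::PatternMatch;

static bool isSelectOfCmpAndOne(Value *V, Value *&X) {
  return match(V, m_OneUse(m_Select(m_Cmp(), m_Value(X), m_One())));
}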
initializer< Ty > init(const Ty &Val)