23#include "llvm/IR/IntrinsicsAArch64.h"
35#define DEBUG_TYPE "aarch64tti"
41 "sve-prefer-fixed-over-scalable-if-equal",
cl::Hidden);
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
63 cl::desc(
"Penalty of inlining a call that requires a change to PSTATE.SM"));
74 cl::desc(
"The cost of a histcnt instruction"));
78 cl::desc(
"The number of instructions to search for a redundant dmb"));
82 cl::desc(
"Threshold for forced unrolling of small loops in AArch64"));
85class TailFoldingOption {
100 bool NeedsDefault =
true;
104 void setNeedsDefault(
bool V) { NeedsDefault =
V; }
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
124 Bits &= ~DisableBits;
130 errs() <<
"invalid argument '" << Opt
131 <<
"' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
139 void operator=(
const std::string &Val) {
148 setNeedsDefault(
false);
151 StringRef(Val).split(TailFoldTypes,
'+', -1,
false);
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] ==
"disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] ==
"all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] ==
"default")
159 setNeedsDefault(
true);
160 else if (TailFoldTypes[0] ==
"simple")
161 setInitialBits(TailFoldingOpts::Simple);
164 setInitialBits(TailFoldingOpts::Disabled);
167 for (
unsigned I = StartIdx;
I < TailFoldTypes.
size();
I++) {
168 if (TailFoldTypes[
I] ==
"reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[
I] ==
"recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[
I] ==
"reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[
I] ==
"noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[
I] ==
"norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[
I] ==
"noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
200 "\ndefault (Initial) Uses the default tail-folding settings for "
202 "\nall (Initial) All legal loop types will vectorize using "
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
213 "\nnoreverse Inverse of above"),
258 TTI->isMultiversionedFunction(
F) ?
"fmv-features" :
"target-features";
259 StringRef FeatureStr =
F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.
split(Features,
",");
276 return F.hasFnAttribute(
"fmv-features");
325 auto FVTy = dyn_cast<FixedVectorType>(Ty);
327 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
336 unsigned DefaultCallPenalty)
const {
361 if (
F ==
Call.getCaller())
367 return DefaultCallPenalty;
378 ST->isSVEorStreamingSVEAvailable() &&
379 !ST->disableMaximizeScalableBandwidth();
403 assert(Ty->isIntegerTy());
405 unsigned BitSize = Ty->getPrimitiveSizeInBits();
412 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
417 for (
unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
423 return std::max<InstructionCost>(1,
Cost);
430 assert(Ty->isIntegerTy());
432 unsigned BitSize = Ty->getPrimitiveSizeInBits();
438 unsigned ImmIdx = ~0U;
442 case Instruction::GetElementPtr:
447 case Instruction::Store:
450 case Instruction::Add:
451 case Instruction::Sub:
452 case Instruction::Mul:
453 case Instruction::UDiv:
454 case Instruction::SDiv:
455 case Instruction::URem:
456 case Instruction::SRem:
457 case Instruction::And:
458 case Instruction::Or:
459 case Instruction::Xor:
460 case Instruction::ICmp:
464 case Instruction::Shl:
465 case Instruction::LShr:
466 case Instruction::AShr:
470 case Instruction::Trunc:
471 case Instruction::ZExt:
472 case Instruction::SExt:
473 case Instruction::IntToPtr:
474 case Instruction::PtrToInt:
475 case Instruction::BitCast:
476 case Instruction::PHI:
477 case Instruction::Call:
478 case Instruction::Select:
479 case Instruction::Ret:
480 case Instruction::Load:
485 int NumConstants = (BitSize + 63) / 64;
498 assert(Ty->isIntegerTy());
500 unsigned BitSize = Ty->getPrimitiveSizeInBits();
509 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
515 case Intrinsic::sadd_with_overflow:
516 case Intrinsic::uadd_with_overflow:
517 case Intrinsic::ssub_with_overflow:
518 case Intrinsic::usub_with_overflow:
519 case Intrinsic::smul_with_overflow:
520 case Intrinsic::umul_with_overflow:
522 int NumConstants = (BitSize + 63) / 64;
529 case Intrinsic::experimental_stackmap:
530 if ((Idx < 2) || (Imm.getBitWidth() <= 64 &&
isInt<64>(Imm.getSExtValue())))
533 case Intrinsic::experimental_patchpoint_void:
534 case Intrinsic::experimental_patchpoint:
535 if ((Idx < 4) || (Imm.getBitWidth() <= 64 &&
isInt<64>(Imm.getSExtValue())))
538 case Intrinsic::experimental_gc_statepoint:
539 if ((Idx < 5) || (Imm.getBitWidth() <= 64 &&
isInt<64>(Imm.getSExtValue())))
549 if (TyWidth == 32 || TyWidth == 64)
558 return ST->getSchedModel().MispredictPenalty;
579 unsigned TotalHistCnts = 1;
589 unsigned EC = VTy->getElementCount().getKnownMinValue();
594 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
596 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
600 TotalHistCnts = EC / NaturalVectorWidth;
620 switch (ICA.
getID()) {
621 case Intrinsic::experimental_vector_histogram_add: {
628 case Intrinsic::clmul: {
633 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
637 if (TLI->getValueType(
DL, RetTy,
true) == MVT::i8) {
642 -1,
nullptr,
nullptr) *
645 -1,
nullptr,
nullptr);
649 if (LT.second.SimpleTy == MVT::nxv2i64)
650 if (ST->hasSVEAES() && (ST->isSVEAvailable() || ST->hasSSVE_AES()))
653 if (ST->hasSVE2() || ST->hasSME()) {
654 switch (LT.second.SimpleTy) {
669 if (LT.second.SimpleTy == MVT::nxv2i64)
673 switch (LT.second.SimpleTy) {
683 -1,
nullptr,
nullptr) *
686 -1,
nullptr,
nullptr));
695 return LT.first * 11;
697 return LT.first * 14;
704 case Intrinsic::umin:
705 case Intrinsic::umax:
706 case Intrinsic::smin:
707 case Intrinsic::smax: {
708 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
709 MVT::v8i16, MVT::v2i32, MVT::v4i32,
710 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
714 if (LT.second == MVT::v2i64)
720 case Intrinsic::scmp:
721 case Intrinsic::ucmp: {
723 {Intrinsic::scmp, MVT::i32, 3},
724 {Intrinsic::scmp, MVT::i64, 3},
725 {Intrinsic::scmp, MVT::v8i8, 3},
726 {Intrinsic::scmp, MVT::v16i8, 3},
727 {Intrinsic::scmp, MVT::v4i16, 3},
728 {Intrinsic::scmp, MVT::v8i16, 3},
729 {Intrinsic::scmp, MVT::v2i32, 3},
730 {Intrinsic::scmp, MVT::v4i32, 3},
731 {Intrinsic::scmp, MVT::v1i64, 3},
732 {Intrinsic::scmp, MVT::v2i64, 3},
738 return Entry->Cost * LT.first;
741 case Intrinsic::sadd_sat:
742 case Intrinsic::ssub_sat:
743 case Intrinsic::uadd_sat:
744 case Intrinsic::usub_sat: {
745 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
746 MVT::v8i16, MVT::v2i32, MVT::v4i32,
752 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
754 return LT.first * Instrs;
759 if (ST->isSVEAvailable() && VectorSize >= 128 &&
isPowerOf2_64(VectorSize))
760 return LT.first * Instrs;
764 case Intrinsic::abs: {
765 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
766 MVT::v8i16, MVT::v2i32, MVT::v4i32,
767 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
768 MVT::nxv4i32, MVT::nxv2i64};
774 case Intrinsic::bswap: {
775 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
776 MVT::v4i32, MVT::v2i64};
779 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
784 case Intrinsic::fmuladd: {
789 (EltTy->
isHalfTy() && ST->hasFullFP16()))
793 case Intrinsic::stepvector: {
802 Cost += AddCost * (LT.first - 1);
806 case Intrinsic::vector_extract:
807 case Intrinsic::vector_insert: {
820 bool IsExtract = ICA.
getID() == Intrinsic::vector_extract;
821 EVT SubVecVT = IsExtract ? getTLI()->getValueType(
DL, RetTy)
829 getTLI()->getTypeConversion(
C, SubVecVT);
831 getTLI()->getTypeConversion(
C, VecVT);
839 case Intrinsic::bitreverse: {
841 {Intrinsic::bitreverse, MVT::i32, 1},
842 {Intrinsic::bitreverse, MVT::i64, 1},
843 {Intrinsic::bitreverse, MVT::v8i8, 1},
844 {Intrinsic::bitreverse, MVT::v16i8, 1},
845 {Intrinsic::bitreverse, MVT::v4i16, 2},
846 {Intrinsic::bitreverse, MVT::v8i16, 2},
847 {Intrinsic::bitreverse, MVT::v2i32, 2},
848 {Intrinsic::bitreverse, MVT::v4i32, 2},
849 {Intrinsic::bitreverse, MVT::v1i64, 2},
850 {Intrinsic::bitreverse, MVT::v2i64, 2},
858 if (TLI->getValueType(
DL, RetTy,
true) == MVT::i8 ||
859 TLI->getValueType(
DL, RetTy,
true) == MVT::i16)
860 return LegalisationCost.first * Entry->Cost + 1;
862 return LegalisationCost.first * Entry->Cost;
866 case Intrinsic::ctpop: {
867 if (!ST->hasNEON()) {
899 RetTy->getScalarSizeInBits()
902 return LT.first * Entry->Cost + ExtraCost;
906 case Intrinsic::sadd_with_overflow:
907 case Intrinsic::uadd_with_overflow:
908 case Intrinsic::ssub_with_overflow:
909 case Intrinsic::usub_with_overflow:
910 case Intrinsic::smul_with_overflow:
911 case Intrinsic::umul_with_overflow: {
913 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
914 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
915 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
916 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
917 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
918 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
919 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
920 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
921 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
922 {Intrinsic::usub_with_overflow, MVT::i8, 3},
923 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
924 {Intrinsic::usub_with_overflow, MVT::i16, 3},
925 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
926 {Intrinsic::usub_with_overflow, MVT::i32, 1},
927 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
928 {Intrinsic::usub_with_overflow, MVT::i64, 1},
929 {Intrinsic::smul_with_overflow, MVT::i8, 5},
930 {Intrinsic::umul_with_overflow, MVT::i8, 4},
931 {Intrinsic::smul_with_overflow, MVT::i16, 5},
932 {Intrinsic::umul_with_overflow, MVT::i16, 4},
933 {Intrinsic::smul_with_overflow, MVT::i32, 2},
934 {Intrinsic::umul_with_overflow, MVT::i32, 2},
935 {Intrinsic::smul_with_overflow, MVT::i64, 3},
936 {Intrinsic::umul_with_overflow, MVT::i64, 3},
938 EVT MTy = TLI->getValueType(
DL, RetTy->getContainedType(0),
true);
945 case Intrinsic::fptosi_sat:
946 case Intrinsic::fptoui_sat: {
949 bool IsSigned = ICA.
getID() == Intrinsic::fptosi_sat;
951 EVT MTy = TLI->getValueType(
DL, RetTy);
954 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
955 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
956 LT.second == MVT::v2f64)) {
958 (LT.second == MVT::f64 && MTy == MVT::i32) ||
959 (LT.second == MVT::f32 && MTy == MVT::i64)))
968 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
975 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
976 (LT.second == MVT::f16 && MTy == MVT::i64) ||
977 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
991 if ((LT.second.getScalarType() == MVT::f32 ||
992 LT.second.getScalarType() == MVT::f64 ||
993 LT.second.getScalarType() == MVT::f16) &&
997 if (LT.second.isVector())
1002 LegalTy, {LegalTy, LegalTy});
1006 LegalTy, {LegalTy, LegalTy});
1008 return LT.first *
Cost +
1009 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
1015 RetTy = RetTy->getScalarType();
1016 if (LT.second.isVector()) {
1034 return LT.first *
Cost;
1036 case Intrinsic::fshl:
1037 case Intrinsic::fshr: {
1046 if (RetTy->isIntegerTy() && ICA.
getArgs()[0] == ICA.
getArgs()[1] &&
1047 (RetTy->getPrimitiveSizeInBits() == 32 ||
1048 RetTy->getPrimitiveSizeInBits() == 64)) {
1061 {Intrinsic::fshl, MVT::v4i32, 2},
1062 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
1063 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
1064 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
1070 return LegalisationCost.first * Entry->Cost;
1074 if (!RetTy->isIntegerTy())
1079 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1080 RetTy->getScalarSizeInBits() < 64) ||
1081 (RetTy->getScalarSizeInBits() % 64 != 0);
1082 unsigned ExtraCost = HigherCost ? 1 : 0;
1083 if (RetTy->getScalarSizeInBits() == 32 ||
1084 RetTy->getScalarSizeInBits() == 64)
1087 else if (HigherCost)
1091 return TyL.first + ExtraCost;
1093 case Intrinsic::get_active_lane_mask: {
1095 EVT RetVT = getTLI()->getValueType(
DL, RetTy);
1097 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1100 if (RetTy->isScalableTy()) {
1101 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1111 if (ST->hasSVE2p1() || ST->hasSME2()) {
1126 return Cost + (SplitCost * (
Cost - 1));
1141 case Intrinsic::experimental_vector_match: {
1144 unsigned SearchSize = NeedleTy->getNumElements();
1145 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1158 case Intrinsic::cttz: {
1160 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1161 return LT.first * 2;
1162 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1163 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1164 return LT.first * 3;
1167 case Intrinsic::experimental_cttz_elts: {
1169 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1177 case Intrinsic::loop_dependence_raw_mask:
1178 case Intrinsic::loop_dependence_war_mask: {
1180 if (ST->hasSVE2() || ST->hasSME()) {
1181 EVT VecVT = getTLI()->getValueType(
DL, RetTy);
1182 unsigned EltSizeInBytes =
1192 case Intrinsic::experimental_vector_extract_last_active:
1193 if (ST->isSVEorStreamingSVEAvailable()) {
1199 case Intrinsic::pow: {
1202 EVT VT = getTLI()->getValueType(
DL, RetTy);
1204 bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
1219 bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
1220 bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
1230 return (Sqrt * 2) +
FMul;
1241 case Intrinsic::sqrt:
1242 case Intrinsic::fabs:
1243 case Intrinsic::ceil:
1244 case Intrinsic::floor:
1245 case Intrinsic::nearbyint:
1246 case Intrinsic::round:
1247 case Intrinsic::rint:
1248 case Intrinsic::roundeven:
1249 case Intrinsic::trunc:
1250 case Intrinsic::minnum:
1251 case Intrinsic::maxnum:
1252 case Intrinsic::minimum:
1253 case Intrinsic::maximum: {
1271 auto RequiredType =
II.getType();
1274 assert(PN &&
"Expected Phi Node!");
1277 if (!PN->hasOneUse())
1278 return std::nullopt;
1280 for (
Value *IncValPhi : PN->incoming_values()) {
1283 Reinterpret->getIntrinsicID() !=
1284 Intrinsic::aarch64_sve_convert_to_svbool ||
1285 RequiredType != Reinterpret->getArgOperand(0)->getType())
1286 return std::nullopt;
1294 for (
unsigned I = 0;
I < PN->getNumIncomingValues();
I++) {
1296 NPN->
addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(
I));
1369 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1374 return GoverningPredicateIdx;
1379 GoverningPredicateIdx = Index;
1397 return UndefIntrinsic;
1402 UndefIntrinsic = IID;
1424 return ResultLanes == InactiveLanesTakenFromOperand;
1429 return OperandIdxForInactiveLanes;
1433 assert(ResultLanes == Uninitialized &&
"Cannot set property twice!");
1434 ResultLanes = InactiveLanesTakenFromOperand;
1435 OperandIdxForInactiveLanes = Index;
1440 return ResultLanes == InactiveLanesAreNotDefined;
1444 assert(ResultLanes == Uninitialized &&
"Cannot set property twice!");
1445 ResultLanes = InactiveLanesAreNotDefined;
1450 return ResultLanes == InactiveLanesAreUnused;
1454 assert(ResultLanes == Uninitialized &&
"Cannot set property twice!");
1455 ResultLanes = InactiveLanesAreUnused;
1465 ResultIsZeroInitialized =
true;
1476 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1481 return OperandIdxWithNoActiveLanes;
1486 OperandIdxWithNoActiveLanes = Index;
1491 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1494 unsigned IROpcode = 0;
1496 enum PredicationStyle {
1498 InactiveLanesTakenFromOperand,
1499 InactiveLanesAreNotDefined,
1500 InactiveLanesAreUnused
1503 bool ResultIsZeroInitialized =
false;
1504 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1505 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1513 return !isa<ScalableVectorType>(V->getType());
1521 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1522 case Intrinsic::aarch64_sve_fcvt_f16f32:
1523 case Intrinsic::aarch64_sve_fcvt_f16f64:
1524 case Intrinsic::aarch64_sve_fcvt_f32f16:
1525 case Intrinsic::aarch64_sve_fcvt_f32f64:
1526 case Intrinsic::aarch64_sve_fcvt_f64f16:
1527 case Intrinsic::aarch64_sve_fcvt_f64f32:
1528 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1529 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1530 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1531 case Intrinsic::aarch64_sve_fcvtzs:
1532 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1533 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1534 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1535 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1536 case Intrinsic::aarch64_sve_fcvtzu:
1537 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1538 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1539 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1540 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1541 case Intrinsic::aarch64_sve_revb:
1542 case Intrinsic::aarch64_sve_revh:
1543 case Intrinsic::aarch64_sve_revw:
1544 case Intrinsic::aarch64_sve_revd:
1545 case Intrinsic::aarch64_sve_scvtf:
1546 case Intrinsic::aarch64_sve_scvtf_f16i32:
1547 case Intrinsic::aarch64_sve_scvtf_f16i64:
1548 case Intrinsic::aarch64_sve_scvtf_f32i64:
1549 case Intrinsic::aarch64_sve_scvtf_f64i32:
1550 case Intrinsic::aarch64_sve_ucvtf:
1551 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1552 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1553 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1554 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1557 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1558 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1559 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1560 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1563 case Intrinsic::aarch64_sve_fabd:
1565 case Intrinsic::aarch64_sve_fadd:
1568 case Intrinsic::aarch64_sve_fdiv:
1571 case Intrinsic::aarch64_sve_fmax:
1573 case Intrinsic::aarch64_sve_fmaxnm:
1575 case Intrinsic::aarch64_sve_fmin:
1577 case Intrinsic::aarch64_sve_fminnm:
1579 case Intrinsic::aarch64_sve_fmla:
1581 case Intrinsic::aarch64_sve_fmls:
1583 case Intrinsic::aarch64_sve_fmul:
1586 case Intrinsic::aarch64_sve_fmulx:
1588 case Intrinsic::aarch64_sve_fnmla:
1590 case Intrinsic::aarch64_sve_fnmls:
1592 case Intrinsic::aarch64_sve_fsub:
1595 case Intrinsic::aarch64_sve_add:
1598 case Intrinsic::aarch64_sve_mla:
1600 case Intrinsic::aarch64_sve_mls:
1602 case Intrinsic::aarch64_sve_mul:
1605 case Intrinsic::aarch64_sve_sabd:
1607 case Intrinsic::aarch64_sve_sdiv:
1610 case Intrinsic::aarch64_sve_smax:
1612 case Intrinsic::aarch64_sve_smin:
1614 case Intrinsic::aarch64_sve_smulh:
1616 case Intrinsic::aarch64_sve_sub:
1619 case Intrinsic::aarch64_sve_uabd:
1621 case Intrinsic::aarch64_sve_udiv:
1624 case Intrinsic::aarch64_sve_umax:
1626 case Intrinsic::aarch64_sve_umin:
1628 case Intrinsic::aarch64_sve_umulh:
1630 case Intrinsic::aarch64_sve_asr:
1633 case Intrinsic::aarch64_sve_lsl:
1636 case Intrinsic::aarch64_sve_lsr:
1639 case Intrinsic::aarch64_sve_and:
1642 case Intrinsic::aarch64_sve_bic:
1644 case Intrinsic::aarch64_sve_eor:
1647 case Intrinsic::aarch64_sve_orr:
1650 case Intrinsic::aarch64_sve_shsub:
1652 case Intrinsic::aarch64_sve_shsubr:
1654 case Intrinsic::aarch64_sve_sqrshl:
1656 case Intrinsic::aarch64_sve_sqshl:
1658 case Intrinsic::aarch64_sve_sqsub:
1660 case Intrinsic::aarch64_sve_srshl:
1662 case Intrinsic::aarch64_sve_uhsub:
1664 case Intrinsic::aarch64_sve_uhsubr:
1666 case Intrinsic::aarch64_sve_uqrshl:
1668 case Intrinsic::aarch64_sve_uqshl:
1670 case Intrinsic::aarch64_sve_uqsub:
1672 case Intrinsic::aarch64_sve_urshl:
1675 case Intrinsic::aarch64_sve_add_u:
1678 case Intrinsic::aarch64_sve_and_u:
1681 case Intrinsic::aarch64_sve_asr_u:
1684 case Intrinsic::aarch64_sve_eor_u:
1687 case Intrinsic::aarch64_sve_fadd_u:
1690 case Intrinsic::aarch64_sve_fdiv_u:
1693 case Intrinsic::aarch64_sve_fmul_u:
1696 case Intrinsic::aarch64_sve_fsub_u:
1699 case Intrinsic::aarch64_sve_lsl_u:
1702 case Intrinsic::aarch64_sve_lsr_u:
1705 case Intrinsic::aarch64_sve_mul_u:
1708 case Intrinsic::aarch64_sve_orr_u:
1711 case Intrinsic::aarch64_sve_sdiv_u:
1714 case Intrinsic::aarch64_sve_sub_u:
1717 case Intrinsic::aarch64_sve_udiv_u:
1721 case Intrinsic::aarch64_sve_addqv:
1722 case Intrinsic::aarch64_sve_and_z:
1723 case Intrinsic::aarch64_sve_bic_z:
1724 case Intrinsic::aarch64_sve_brka_z:
1725 case Intrinsic::aarch64_sve_brkb_z:
1726 case Intrinsic::aarch64_sve_brkn_z:
1727 case Intrinsic::aarch64_sve_brkpa_z:
1728 case Intrinsic::aarch64_sve_brkpb_z:
1729 case Intrinsic::aarch64_sve_cntp:
1730 case Intrinsic::aarch64_sve_compact:
1731 case Intrinsic::aarch64_sve_eor_z:
1732 case Intrinsic::aarch64_sve_eorv:
1733 case Intrinsic::aarch64_sve_eorqv:
1734 case Intrinsic::aarch64_sve_nand_z:
1735 case Intrinsic::aarch64_sve_nor_z:
1736 case Intrinsic::aarch64_sve_orn_z:
1737 case Intrinsic::aarch64_sve_orr_z:
1738 case Intrinsic::aarch64_sve_orv:
1739 case Intrinsic::aarch64_sve_orqv:
1740 case Intrinsic::aarch64_sve_pnext:
1741 case Intrinsic::aarch64_sve_rdffr_z:
1742 case Intrinsic::aarch64_sve_saddv:
1743 case Intrinsic::aarch64_sve_uaddv:
1744 case Intrinsic::aarch64_sve_umaxv:
1745 case Intrinsic::aarch64_sve_umaxqv:
1746 case Intrinsic::aarch64_sve_cmpeq:
1747 case Intrinsic::aarch64_sve_cmpeq_wide:
1748 case Intrinsic::aarch64_sve_cmpge:
1749 case Intrinsic::aarch64_sve_cmpge_wide:
1750 case Intrinsic::aarch64_sve_cmpgt:
1751 case Intrinsic::aarch64_sve_cmpgt_wide:
1752 case Intrinsic::aarch64_sve_cmphi:
1753 case Intrinsic::aarch64_sve_cmphi_wide:
1754 case Intrinsic::aarch64_sve_cmphs:
1755 case Intrinsic::aarch64_sve_cmphs_wide:
1756 case Intrinsic::aarch64_sve_cmple_wide:
1757 case Intrinsic::aarch64_sve_cmplo_wide:
1758 case Intrinsic::aarch64_sve_cmpls_wide:
1759 case Intrinsic::aarch64_sve_cmplt_wide:
1760 case Intrinsic::aarch64_sve_cmpne:
1761 case Intrinsic::aarch64_sve_cmpne_wide:
1762 case Intrinsic::aarch64_sve_facge:
1763 case Intrinsic::aarch64_sve_facgt:
1764 case Intrinsic::aarch64_sve_fcmpeq:
1765 case Intrinsic::aarch64_sve_fcmpge:
1766 case Intrinsic::aarch64_sve_fcmpgt:
1767 case Intrinsic::aarch64_sve_fcmpne:
1768 case Intrinsic::aarch64_sve_fcmpuo:
1769 case Intrinsic::aarch64_sve_ld1:
1770 case Intrinsic::aarch64_sve_ld1_gather:
1771 case Intrinsic::aarch64_sve_ld1_gather_index:
1772 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1773 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1774 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1775 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1776 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1777 case Intrinsic::aarch64_sve_ld1q_gather_index:
1778 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1779 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1780 case Intrinsic::aarch64_sve_ld1ro:
1781 case Intrinsic::aarch64_sve_ld1rq:
1782 case Intrinsic::aarch64_sve_ld1udq:
1783 case Intrinsic::aarch64_sve_ld1uwq:
1784 case Intrinsic::aarch64_sve_ld2_sret:
1785 case Intrinsic::aarch64_sve_ld2q_sret:
1786 case Intrinsic::aarch64_sve_ld3_sret:
1787 case Intrinsic::aarch64_sve_ld3q_sret:
1788 case Intrinsic::aarch64_sve_ld4_sret:
1789 case Intrinsic::aarch64_sve_ld4q_sret:
1790 case Intrinsic::aarch64_sve_ldff1:
1791 case Intrinsic::aarch64_sve_ldff1_gather:
1792 case Intrinsic::aarch64_sve_ldff1_gather_index:
1793 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1794 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1795 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1796 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1797 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1798 case Intrinsic::aarch64_sve_ldnf1:
1799 case Intrinsic::aarch64_sve_ldnt1:
1800 case Intrinsic::aarch64_sve_ldnt1_gather:
1801 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1802 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1803 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1806 case Intrinsic::aarch64_sve_prf:
1807 case Intrinsic::aarch64_sve_prfb_gather_index:
1808 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1809 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1810 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1811 case Intrinsic::aarch64_sve_prfd_gather_index:
1812 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1813 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1814 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1815 case Intrinsic::aarch64_sve_prfh_gather_index:
1816 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1817 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1818 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1819 case Intrinsic::aarch64_sve_prfw_gather_index:
1820 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1821 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1822 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1825 case Intrinsic::aarch64_sve_st1_scatter:
1826 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1827 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1828 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1829 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1830 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1831 case Intrinsic::aarch64_sve_st1dq:
1832 case Intrinsic::aarch64_sve_st1q_scatter_index:
1833 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1834 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1835 case Intrinsic::aarch64_sve_st1wq:
1836 case Intrinsic::aarch64_sve_stnt1:
1837 case Intrinsic::aarch64_sve_stnt1_scatter:
1838 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1839 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1840 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1842 case Intrinsic::aarch64_sve_st2:
1843 case Intrinsic::aarch64_sve_st2q:
1845 case Intrinsic::aarch64_sve_st3:
1846 case Intrinsic::aarch64_sve_st3q:
1848 case Intrinsic::aarch64_sve_st4:
1849 case Intrinsic::aarch64_sve_st4q:
1857 Value *UncastedPred;
1863 Pred = UncastedPred;
1869 if (OrigPredTy->getMinNumElements() <=
1871 ->getMinNumElements())
1872 Pred = UncastedPred;
1876 return C &&
C->isAllOnesValue();
1883 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1884 Dup->getOperand(1) == Pg &&
isa<Constant>(Dup->getOperand(2)))
1892static std::optional<Instruction *>
1899 Value *Op1 =
II.getOperand(1);
1900 Value *Op2 =
II.getOperand(2);
1926 return std::nullopt;
1934 if (SimpleII == Inactive)
1944static std::optional<Instruction *>
1948 return std::nullopt;
1977 II.setCalledFunction(NewDecl);
1987 return std::nullopt;
1999static std::optional<Instruction *>
2003 return std::nullopt;
2005 auto IntrinsicID = BinOp->getIntrinsicID();
2006 switch (IntrinsicID) {
2007 case Intrinsic::aarch64_sve_and_z:
2008 case Intrinsic::aarch64_sve_bic_z:
2009 case Intrinsic::aarch64_sve_eor_z:
2010 case Intrinsic::aarch64_sve_nand_z:
2011 case Intrinsic::aarch64_sve_nor_z:
2012 case Intrinsic::aarch64_sve_orn_z:
2013 case Intrinsic::aarch64_sve_orr_z:
2016 return std::nullopt;
2019 auto BinOpPred = BinOp->getOperand(0);
2020 auto BinOpOp1 = BinOp->getOperand(1);
2021 auto BinOpOp2 = BinOp->getOperand(2);
2025 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
2026 return std::nullopt;
2028 auto PredOp = PredIntr->getOperand(0);
2030 if (PredOpTy !=
II.getType())
2031 return std::nullopt;
2035 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
2036 NarrowedBinOpArgs.
push_back(NarrowBinOpOp1);
2037 if (BinOpOp1 == BinOpOp2)
2038 NarrowedBinOpArgs.
push_back(NarrowBinOpOp1);
2041 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
2043 auto NarrowedBinOp =
2048static std::optional<Instruction *>
2055 return BinOpCombine;
2060 return std::nullopt;
2063 Value *Cursor =
II.getOperand(0), *EarliestReplacement =
nullptr;
2072 if (CursorVTy->getElementCount().getKnownMinValue() <
2073 IVTy->getElementCount().getKnownMinValue())
2077 if (Cursor->getType() == IVTy)
2078 EarliestReplacement = Cursor;
2083 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2084 Intrinsic::aarch64_sve_convert_to_svbool ||
2085 IntrinsicCursor->getIntrinsicID() ==
2086 Intrinsic::aarch64_sve_convert_from_svbool))
2089 CandidatesForRemoval.
insert(CandidatesForRemoval.
begin(), IntrinsicCursor);
2090 Cursor = IntrinsicCursor->getOperand(0);
2095 if (!EarliestReplacement)
2096 return std::nullopt;
2104 auto *OpPredicate =
II.getOperand(0);
2121 II.getArgOperand(2));
2127 return std::nullopt;
2131 II.getArgOperand(0),
II.getArgOperand(2),
uint64_t(0));
2140 II.getArgOperand(0));
2150 return std::nullopt;
2155 if (!SplatValue || !SplatValue->isZero())
2156 return std::nullopt;
2161 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2162 return std::nullopt;
2166 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2167 return std::nullopt;
2170 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2171 return std::nullopt;
2176 return std::nullopt;
2179 return std::nullopt;
2183 return std::nullopt;
2187 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2188 return std::nullopt;
2190 unsigned NumElts = VecTy->getNumElements();
2191 unsigned PredicateBits = 0;
2194 for (
unsigned I = 0;
I < NumElts; ++
I) {
2197 return std::nullopt;
2199 PredicateBits |= 1 << (
I * (16 / NumElts));
2203 if (PredicateBits == 0) {
2205 PFalse->takeName(&
II);
2211 for (
unsigned I = 0;
I < 16; ++
I)
2212 if ((PredicateBits & (1 <<
I)) != 0)
2215 unsigned PredSize = Mask & -Mask;
2220 for (
unsigned I = 0;
I < 16;
I += PredSize)
2221 if ((PredicateBits & (1 <<
I)) == 0)
2222 return std::nullopt;
2224 auto *ConvertToSVBool =
2227 auto *ConvertFromSVBool =
2229 II.getType(), ConvertToSVBool);
2237 Value *Pg =
II.getArgOperand(0);
2238 Value *Vec =
II.getArgOperand(1);
2239 auto IntrinsicID =
II.getIntrinsicID();
2240 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2252 auto OpC = OldBinOp->getOpcode();
2258 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(),
II.getIterator());
2264 if (IsAfter &&
C &&
C->isNullValue()) {
2268 Extract->insertBefore(
II.getIterator());
2269 Extract->takeName(&
II);
2275 return std::nullopt;
2277 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2278 return std::nullopt;
2280 const auto PTruePattern =
2286 return std::nullopt;
2288 unsigned Idx = MinNumElts - 1;
2298 if (Idx >= PgVTy->getMinNumElements())
2299 return std::nullopt;
2304 Extract->insertBefore(
II.getIterator());
2305 Extract->takeName(&
II);
2318 Value *Pg =
II.getArgOperand(0);
2320 Value *Vec =
II.getArgOperand(2);
2323 if (!Ty->isIntegerTy())
2324 return std::nullopt;
2329 return std::nullopt;
2346 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2361static std::optional<Instruction *>
2365 if (
Pattern == AArch64SVEPredPattern::all) {
2374 return MinNumElts && NumElts >= MinNumElts
2376 II, ConstantInt::get(
II.getType(), MinNumElts)))
2380static std::optional<Instruction *>
2383 if (!ST->isStreaming())
2384 return std::nullopt;
2396 Value *PgVal =
II.getArgOperand(0);
2397 Value *OpVal =
II.getArgOperand(1);
2401 if (PgVal == OpVal &&
2402 (
II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2403 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2418 return std::nullopt;
2422 if (Pg->
getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2423 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2437 if ((Pg ==
Op) && (
II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2438 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2439 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2440 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2441 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2442 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2443 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2444 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2445 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2446 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2447 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2448 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2449 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2459 return std::nullopt;
2462template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2463static std::optional<Instruction *>
2465 bool MergeIntoAddendOp) {
2467 Value *MulOp0, *MulOp1, *AddendOp, *
Mul;
2468 if (MergeIntoAddendOp) {
2469 AddendOp =
II.getOperand(1);
2470 Mul =
II.getOperand(2);
2472 AddendOp =
II.getOperand(2);
2473 Mul =
II.getOperand(1);
2478 return std::nullopt;
2480 if (!
Mul->hasOneUse())
2481 return std::nullopt;
2484 if (
II.getType()->isFPOrFPVectorTy()) {
2489 return std::nullopt;
2491 return std::nullopt;
2496 if (MergeIntoAddendOp)
2506static std::optional<Instruction *>
2508 Value *Pred =
II.getOperand(0);
2509 Value *PtrOp =
II.getOperand(1);
2510 Type *VecTy =
II.getType();
2514 Load->copyMetadata(
II);
2525static std::optional<Instruction *>
2527 Value *VecOp =
II.getOperand(0);
2528 Value *Pred =
II.getOperand(1);
2529 Value *PtrOp =
II.getOperand(2);
2533 Store->copyMetadata(
II);
2545 case Intrinsic::aarch64_sve_fmul_u:
2546 return Instruction::BinaryOps::FMul;
2547 case Intrinsic::aarch64_sve_fadd_u:
2548 return Instruction::BinaryOps::FAdd;
2549 case Intrinsic::aarch64_sve_fsub_u:
2550 return Instruction::BinaryOps::FSub;
2552 return Instruction::BinaryOpsEnd;
2556static std::optional<Instruction *>
2559 if (
II.isStrictFP())
2560 return std::nullopt;
2562 auto *OpPredicate =
II.getOperand(0);
2564 if (BinOpCode == Instruction::BinaryOpsEnd ||
2566 return std::nullopt;
2568 BinOpCode,
II.getOperand(1),
II.getOperand(2),
II.getFastMathFlags());
2572static std::optional<Instruction *>
2574 assert(
II.getIntrinsicID() == Intrinsic::aarch64_sve_mla_u &&
2575 "Expected MLA_U intrinsic");
2576 Value *Acc =
II.getArgOperand(1);
2577 Value *MulOp0 =
II.getArgOperand(2);
2578 Value *MulOp1 =
II.getArgOperand(3);
2593 II.setArgOperand(2, MulOp1);
2594 II.setArgOperand(3, MulOp0);
2598 return std::nullopt;
2604 Intrinsic::aarch64_sve_mla>(
2608 Intrinsic::aarch64_sve_mad>(
2611 return std::nullopt;
2614static std::optional<Instruction *>
2618 Intrinsic::aarch64_sve_fmla>(IC,
II,
2623 Intrinsic::aarch64_sve_fmad>(IC,
II,
2628 Intrinsic::aarch64_sve_fmla>(IC,
II,
2631 return std::nullopt;
2634static std::optional<Instruction *>
2638 Intrinsic::aarch64_sve_fmla>(IC,
II,
2643 Intrinsic::aarch64_sve_fmad>(IC,
II,
2648 Intrinsic::aarch64_sve_fmla_u>(
2654static std::optional<Instruction *>
2658 Intrinsic::aarch64_sve_fmls>(IC,
II,
2663 Intrinsic::aarch64_sve_fnmsb>(
2668 Intrinsic::aarch64_sve_fmls>(IC,
II,
2671 return std::nullopt;
2674static std::optional<Instruction *>
2678 Intrinsic::aarch64_sve_fmls>(IC,
II,
2683 Intrinsic::aarch64_sve_fnmsb>(
2688 Intrinsic::aarch64_sve_fmls_u>(
2697 Intrinsic::aarch64_sve_mls>(
2700 return std::nullopt;
2705 Value *UnpackArg =
II.getArgOperand(0);
2707 bool IsSigned =
II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2708 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2721 return std::nullopt;
2725 auto *OpVal =
II.getOperand(0);
2726 auto *OpIndices =
II.getOperand(1);
2733 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2734 return std::nullopt;
2749 Type *RetTy =
II.getType();
2750 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2751 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2755 if ((
match(
II.getArgOperand(0),
2762 if (TyA ==
B->getType() &&
2767 TyA->getMinNumElements());
2773 return std::nullopt;
2781 if (
match(
II.getArgOperand(0),
2786 II, (
II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ?
A :
B));
2788 return std::nullopt;
2791static std::optional<Instruction *>
2793 Value *Mask =
II.getOperand(0);
2794 Value *BasePtr =
II.getOperand(1);
2795 Value *Index =
II.getOperand(2);
2806 BasePtr->getPointerAlignment(
II.getDataLayout());
2809 BasePtr, IndexBase);
2816 return std::nullopt;
2819static std::optional<Instruction *>
2821 Value *Val =
II.getOperand(0);
2822 Value *Mask =
II.getOperand(1);
2823 Value *BasePtr =
II.getOperand(2);
2824 Value *Index =
II.getOperand(3);
2834 BasePtr->getPointerAlignment(
II.getDataLayout());
2837 BasePtr, IndexBase);
2843 return std::nullopt;
2849 Value *Pred =
II.getOperand(0);
2850 Value *Vec =
II.getOperand(1);
2851 Value *DivVec =
II.getOperand(2);
2855 if (!SplatConstantInt)
2856 return std::nullopt;
2860 if (DivisorValue == -1)
2861 return std::nullopt;
2862 if (DivisorValue == 1)
2868 Intrinsic::aarch64_sve_asrd, {
II.getType()}, {Pred, Vec, DivisorLog2});
2875 Intrinsic::aarch64_sve_asrd, {
II.getType()}, {Pred, Vec, DivisorLog2});
2877 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2881 return std::nullopt;
2885 size_t VecSize = Vec.
size();
2890 size_t HalfVecSize = VecSize / 2;
2894 if (*
LHS !=
nullptr && *
RHS !=
nullptr) {
2902 if (*
LHS ==
nullptr && *
RHS !=
nullptr)
2920 return std::nullopt;
2927 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2928 CurrentInsertElt = InsertElt->getOperand(0);
2934 return std::nullopt;
2938 for (
size_t I = 0;
I < Elts.
size();
I++) {
2939 if (Elts[
I] ==
nullptr)
2944 if (InsertEltChain ==
nullptr)
2945 return std::nullopt;
2951 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.
size();
2952 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2953 IIScalableTy->getMinNumElements() /
2958 auto *WideShuffleMaskTy =
2969 auto NarrowBitcast =
2982 return std::nullopt;
2987 Value *Pred =
II.getOperand(0);
2988 Value *Vec =
II.getOperand(1);
2989 Value *Shift =
II.getOperand(2);
2992 Value *AbsPred, *MergedValue;
2998 return std::nullopt;
3006 return std::nullopt;
3011 return std::nullopt;
3014 {
II.getType()}, {Pred, Vec, Shift});
3021 Value *Vec =
II.getOperand(0);
3026 return std::nullopt;
3032 auto *NI =
II.getNextNode();
3035 return !
I->mayReadOrWriteMemory() && !
I->mayHaveSideEffects();
3037 while (LookaheadThreshold-- && CanSkipOver(NI)) {
3038 auto *NIBB = NI->getParent();
3039 NI = NI->getNextNode();
3041 if (
auto *SuccBB = NIBB->getUniqueSuccessor())
3042 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
3048 if (NextII &&
II.isIdenticalTo(NextII))
3051 return std::nullopt;
3059 {II.getType(), II.getOperand(0)->getType()},
3060 {II.getOperand(0), II.getOperand(1)}));
3067 if (PredPattern == AArch64SVEPredPattern::all ||
3068 PredPattern == AArch64SVEPredPattern::pow2)
3070 return std::nullopt;
3076 Value *Passthru =
II.getOperand(0);
3084 auto *Mask = ConstantInt::get(Ty, MaskValue);
3090 return std::nullopt;
3093static std::optional<Instruction *>
3100 return std::nullopt;
3103std::optional<Instruction *>
3114 case Intrinsic::aarch64_dmb:
3116 case Intrinsic::aarch64_neon_fmaxnm:
3117 case Intrinsic::aarch64_neon_fminnm:
3119 case Intrinsic::aarch64_sve_convert_from_svbool:
3121 case Intrinsic::aarch64_sve_dup:
3123 case Intrinsic::aarch64_sve_dup_x:
3125 case Intrinsic::aarch64_sve_cmpne:
3126 case Intrinsic::aarch64_sve_cmpne_wide:
3128 case Intrinsic::aarch64_sve_rdffr:
3130 case Intrinsic::aarch64_sve_lasta:
3131 case Intrinsic::aarch64_sve_lastb:
3133 case Intrinsic::aarch64_sve_clasta_n:
3134 case Intrinsic::aarch64_sve_clastb_n:
3136 case Intrinsic::aarch64_sve_cntd:
3138 case Intrinsic::aarch64_sve_cntw:
3140 case Intrinsic::aarch64_sve_cnth:
3142 case Intrinsic::aarch64_sve_cntb:
3144 case Intrinsic::aarch64_sme_cntsd:
3146 case Intrinsic::aarch64_sve_ptest_any:
3147 case Intrinsic::aarch64_sve_ptest_first:
3148 case Intrinsic::aarch64_sve_ptest_last:
3150 case Intrinsic::aarch64_sve_fadd:
3152 case Intrinsic::aarch64_sve_fadd_u:
3154 case Intrinsic::aarch64_sve_fmul_u:
3156 case Intrinsic::aarch64_sve_fsub:
3158 case Intrinsic::aarch64_sve_fsub_u:
3160 case Intrinsic::aarch64_sve_add:
3162 case Intrinsic::aarch64_sve_add_u:
3164 Intrinsic::aarch64_sve_mla_u>(
3166 case Intrinsic::aarch64_sve_mla_u:
3168 case Intrinsic::aarch64_sve_sub:
3170 case Intrinsic::aarch64_sve_sub_u:
3172 Intrinsic::aarch64_sve_mls_u>(
3174 case Intrinsic::aarch64_sve_tbl:
3176 case Intrinsic::aarch64_sve_uunpkhi:
3177 case Intrinsic::aarch64_sve_uunpklo:
3178 case Intrinsic::aarch64_sve_sunpkhi:
3179 case Intrinsic::aarch64_sve_sunpklo:
3181 case Intrinsic::aarch64_sve_uzp1:
3183 case Intrinsic::aarch64_sve_zip1:
3184 case Intrinsic::aarch64_sve_zip2:
3186 case Intrinsic::aarch64_sve_ld1_gather_index:
3188 case Intrinsic::aarch64_sve_st1_scatter_index:
3190 case Intrinsic::aarch64_sve_ld1:
3192 case Intrinsic::aarch64_sve_st1:
3194 case Intrinsic::aarch64_sve_sdiv:
3196 case Intrinsic::aarch64_sve_sel:
3198 case Intrinsic::aarch64_sve_srshl:
3200 case Intrinsic::aarch64_sve_dupq_lane:
3202 case Intrinsic::aarch64_sve_insr:
3204 case Intrinsic::aarch64_sve_whilelo:
3206 case Intrinsic::aarch64_sve_ptrue:
3208 case Intrinsic::aarch64_sve_uxtb:
3210 case Intrinsic::aarch64_sve_uxth:
3212 case Intrinsic::aarch64_sve_uxtw:
3214 case Intrinsic::aarch64_sme_in_streaming_mode:
3218 return std::nullopt;
3225 SimplifyAndSetOp)
const {
3226 switch (
II.getIntrinsicID()) {
3229 case Intrinsic::aarch64_neon_fcvtxn:
3230 case Intrinsic::aarch64_neon_rshrn:
3231 case Intrinsic::aarch64_neon_sqrshrn:
3232 case Intrinsic::aarch64_neon_sqrshrun:
3233 case Intrinsic::aarch64_neon_sqshrn:
3234 case Intrinsic::aarch64_neon_sqshrun:
3235 case Intrinsic::aarch64_neon_sqxtn:
3236 case Intrinsic::aarch64_neon_sqxtun:
3237 case Intrinsic::aarch64_neon_uqrshrn:
3238 case Intrinsic::aarch64_neon_uqshrn:
3239 case Intrinsic::aarch64_neon_uqxtn:
3240 SimplifyAndSetOp(&
II, 0, OrigDemandedElts, UndefElts);
3244 return std::nullopt;
3248 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3258 if (ST->useSVEForFixedLengthVectors() &&
3261 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3262 else if (ST->isNeonAvailable())
3267 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3276bool AArch64TTIImpl::isSingleExtWideningInstruction(
3278 Type *SrcOverrideTy)
const {
3293 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3296 Type *SrcTy = SrcOverrideTy;
3298 case Instruction::Add:
3299 case Instruction::Sub: {
3308 if (Opcode == Instruction::Sub)
3332 assert(SrcTy &&
"Expected some SrcTy");
3334 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3340 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3342 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3346 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3349Type *AArch64TTIImpl::isBinExtWideningInstruction(
unsigned Opcode,
Type *DstTy,
3351 Type *SrcOverrideTy)
const {
3352 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3353 Opcode != Instruction::Mul)
3363 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3366 auto getScalarSizeWithOverride = [&](
const Value *
V) {
3372 ->getScalarSizeInBits();
3375 unsigned MaxEltSize = 0;
3378 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3379 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3380 MaxEltSize = std::max(EltSize0, EltSize1);
3383 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3384 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3387 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3389 MaxEltSize = DstEltSize / 2;
3390 }
else if (Opcode == Instruction::Mul &&
3403 getScalarSizeWithOverride(
isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3407 if (MaxEltSize * 2 > DstEltSize)
3425 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(
DL, Src)) ||
3426 (Src->isScalableTy() && !ST->hasSVE2()))
3436 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3440 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3444 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3445 Src->getScalarSizeInBits() !=
3469 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3473 if (
I &&
I->hasOneUser()) {
3476 if (
Type *ExtTy = isBinExtWideningInstruction(
3477 SingleUser->getOpcode(), Dst, Operands,
3478 Src !=
I->getOperand(0)->getType() ? Src :
nullptr)) {
3491 if (isSingleExtWideningInstruction(
3492 SingleUser->getOpcode(), Dst, Operands,
3493 Src !=
I->getOperand(0)->getType() ? Src :
nullptr)) {
3497 if (SingleUser->getOpcode() == Instruction::Add) {
3498 if (
I == SingleUser->getOperand(1) ||
3500 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3515 EVT SrcTy = TLI->getValueType(
DL, Src);
3516 EVT DstTy = TLI->getValueType(
DL, Dst);
3518 if (!SrcTy.isSimple() || !DstTy.
isSimple())
3523 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3552 EVT WiderTy = SrcTy.
bitsGT(DstTy) ? SrcTy : DstTy;
3555 ST->useSVEForFixedLengthVectors(WiderTy)) {
3556 std::pair<InstructionCost, MVT> LT =
3558 unsigned NumElements =
3574 const unsigned int SVE_EXT_COST = 1;
3575 const unsigned int SVE_FCVT_COST = 1;
3576 const unsigned int SVE_UNPACK_ONCE = 4;
3577 const unsigned int SVE_UNPACK_TWICE = 16;
3706 SVE_EXT_COST + SVE_FCVT_COST},
3711 SVE_EXT_COST + SVE_FCVT_COST},
3718 SVE_EXT_COST + SVE_FCVT_COST},
3722 SVE_EXT_COST + SVE_FCVT_COST},
3728 SVE_EXT_COST + SVE_FCVT_COST},
3731 SVE_EXT_COST + SVE_FCVT_COST},
3736 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3738 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3748 SVE_EXT_COST + SVE_FCVT_COST},
3753 SVE_EXT_COST + SVE_FCVT_COST},
3766 SVE_EXT_COST + SVE_FCVT_COST},
3770 SVE_EXT_COST + SVE_FCVT_COST},
3782 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3784 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3786 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3788 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3792 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3794 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3810 SVE_EXT_COST + SVE_FCVT_COST},
3815 SVE_EXT_COST + SVE_FCVT_COST},
3826 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3828 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3830 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3832 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3834 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3836 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3840 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3842 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3844 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3846 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
4071 if (ST->hasFullFP16())
4083 Src->getScalarType(), CCH,
CostKind) +
4091 ST->isSVEorStreamingSVEAvailable() &&
4092 TLI->getTypeAction(Src->getContext(), SrcTy) ==
4094 TLI->getTypeAction(Dst->getContext(), DstTy) ==
4103 Opcode, LegalTy, Src, CCH,
CostKind,
I);
4106 return Part1 + Part2;
4113 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4125 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4138 CostKind, Index,
nullptr,
nullptr);
4142 auto DstVT = TLI->getValueType(
DL, Dst);
4143 auto SrcVT = TLI->getValueType(
DL, Src);
4148 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4154 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4164 case Instruction::SExt:
4169 case Instruction::ZExt:
4170 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4183 return Opcode == Instruction::PHI ? 0 : 1;
4192 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4201 if (!LT.second.isVector())
4206 if (LT.second.isFixedLengthVector()) {
4207 unsigned Width = LT.second.getVectorNumElements();
4208 Index = Index % Width;
4223 if (ST->hasFastLD1Single())
4235 : ST->getVectorInsertExtractBaseCost() + 1;
4259 auto ExtractCanFuseWithFmul = [&]() {
4266 auto IsAllowedScalarTy = [&](
const Type *
T) {
4267 return T->isFloatTy() ||
T->isDoubleTy() ||
4268 (
T->isHalfTy() && ST->hasFullFP16());
4272 auto IsUserFMulScalarTy = [](
const Value *EEUser) {
4275 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4276 !BO->getType()->isVectorTy();
4281 auto IsExtractLaneEquivalentToZero = [&](
unsigned Idx,
unsigned EltSz) {
4285 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4294 DenseMap<User *, unsigned> UserToExtractIdx;
4295 for (
auto *U :
Scalar->users()) {
4296 if (!IsUserFMulScalarTy(U))
4300 UserToExtractIdx[
U];
4302 if (UserToExtractIdx.
empty())
4304 for (
auto &[S, U, L] : ScalarUserAndIdx) {
4305 for (
auto *U : S->users()) {
4306 if (UserToExtractIdx.
contains(U)) {
4308 auto *Op0 =
FMul->getOperand(0);
4309 auto *Op1 =
FMul->getOperand(1);
4310 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4311 UserToExtractIdx[
U] =
L;
4317 for (
auto &[U, L] : UserToExtractIdx) {
4329 return !EE->users().empty() &&
all_of(EE->users(), [&](
const User *U) {
4330 if (!IsUserFMulScalarTy(U))
4335 const auto *BO = cast<BinaryOperator>(U);
4336 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4337 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4339 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4342 return IsExtractLaneEquivalentToZero(
4343 cast<ConstantInt>(OtherEE->getIndexOperand())
4346 OtherEE->getType()->getScalarSizeInBits());
4354 if (Opcode == Instruction::ExtractElement && (
I || Scalar) &&
4355 ExtractCanFuseWithFmul())
4360 :
ST->getVectorInsertExtractBaseCost();
4369 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4372 return getVectorInstrCostHelper(Opcode, Val,
CostKind, Index,
nullptr,
4378 Value *Scalar,
ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4380 return getVectorInstrCostHelper(Opcode, Val,
CostKind, Index,
nullptr, Scalar,
4381 ScalarUserAndIdx, VIC);
4388 return getVectorInstrCostHelper(
I.getOpcode(), Val,
CostKind, Index, &
I,
4395 unsigned Index)
const {
4407 : ST->getVectorInsertExtractBaseCost() + 1;
4416 if (Ty->getElementType()->isFloatingPointTy())
4419 unsigned VecInstCost =
4421 return DemandedElts.
popcount() * (Insert + Extract) * VecInstCost;
4428 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4429 return std::nullopt;
4430 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4431 return std::nullopt;
4433 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4434 return std::nullopt;
4441 Cost += InstCost(PromotedTy);
4464 Op2Info, Args, CxtI);
4468 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4475 Ty,
CostKind, Op1Info, Op2Info,
true,
4478 [&](
Type *PromotedTy) {
4482 return *PromotedCost;
4485 if (Ty->getScalarType()->isFP128Ty())
4493 if (
Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
4509 if (LT.second == MVT::v2i64) {
4589 auto VT = TLI->getValueType(
DL, Ty);
4590 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4594 : (3 * AsrCost + AddCost);
4596 return MulCost + AsrCost + 2 * AddCost;
4598 }
else if (VT.isVector()) {
4608 if (Ty->isScalableTy() && ST->hasSVE())
4609 Cost += 2 * AsrCost;
4614 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4618 }
else if (LT.second == MVT::v2i64) {
4619 return VT.getVectorNumElements() *
4626 if (Ty->isScalableTy() && ST->hasSVE())
4627 return MulCost + 2 * AddCost + 2 * AsrCost;
4628 return 2 * MulCost + AddCost + AsrCost + UsraCost;
4633 LT.second.isFixedLengthVector()) {
4643 return ExtractCost + InsertCost +
4651 auto VT = TLI->getValueType(
DL, Ty);
4667 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4668 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4669 LT.second == MVT::nxv16i8;
4670 bool Is128bit = LT.second.is128BitVector();
4682 (HasMULH ? 0 : ShrCost) +
4683 AddCost * 2 + ShrCost;
4684 return DivCost + (
ISD ==
ISD::UREM ? MulCost + AddCost : 0);
4691 if (!VT.isVector() && VT.getSizeInBits() > 64)
4695 Opcode, Ty,
CostKind, Op1Info, Op2Info);
4697 if (TLI->isOperationLegalOrCustom(
ISD, LT.second) && ST->hasSVE()) {
4701 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
4711 if (
nullptr != Entry)
4716 if (LT.second.getScalarType() == MVT::i8)
4718 else if (LT.second.getScalarType() == MVT::i16)
4730 Opcode, Ty->getScalarType(),
CostKind, Op1Info, Op2Info);
4731 return (4 + DivCost) * VTy->getNumElements();
4737 -1,
nullptr,
nullptr);
4760 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4761 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4770 if (!Ty->getScalarType()->isFP128Ty())
4777 if (!Ty->getScalarType()->isFP128Ty())
4778 return 2 * LT.first;
4785 if (!Ty->isVectorTy())
4801 int MaxMergeDistance = 64;
4805 return NumVectorInstToHideOverhead;
4815 unsigned Opcode1,
unsigned Opcode2)
const {
4818 if (!
Sched.hasInstrSchedModel())
4822 Sched.getSchedClassDesc(
TII->get(Opcode1).getSchedClass());
4824 Sched.getSchedClassDesc(
TII->get(Opcode2).getSchedClass());
4830 "Cannot handle variant scheduling classes without an MI");
4846 const int AmortizationCost = 20;
4854 VecPred = CurrentPred;
4862 static const auto ValidMinMaxTys = {
4863 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4864 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4865 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4869 (ST->hasFullFP16() &&
4875 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4876 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4877 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4878 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4879 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4880 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4881 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4882 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4883 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4884 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4885 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4887 EVT SelCondTy = TLI->getValueType(
DL, CondTy);
4888 EVT SelValTy = TLI->getValueType(
DL, ValTy);
4897 if (Opcode == Instruction::FCmp) {
4899 ValTy,
CostKind, Op1Info, Op2Info,
false,
4901 false, [&](
Type *PromotedTy) {
4913 return *PromotedCost;
4917 if (LT.second.getScalarType() != MVT::f64 &&
4918 LT.second.getScalarType() != MVT::f32 &&
4919 LT.second.getScalarType() != MVT::f16)
4924 unsigned Factor = 1;
4925 if (!CondTy->isVectorTy() &&
4939 AArch64::FCMEQv4f32))
4951 TLI->isTypeLegal(TLI->getValueType(
DL, ValTy)) &&
4970 Op1Info, Op2Info,
I);
4976 if (ST->requiresStrictAlign()) {
4981 Options.AllowOverlappingLoads =
true;
4982 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
4987 Options.LoadSizes = {8, 4, 2, 1};
4988 Options.AllowedTailExpansions = {3, 5, 6};
4993 return ST->hasSVE();
4999 switch (MICA.
getID()) {
5000 case Intrinsic::masked_scatter:
5001 case Intrinsic::masked_gather:
5003 case Intrinsic::masked_load:
5004 case Intrinsic::masked_expandload:
5005 case Intrinsic::masked_store:
5019 if (!LT.first.isValid())
5024 if (VT->getElementType()->isIntegerTy(1))
5035 if (MICA.
getID() == Intrinsic::masked_expandload) {
5051 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
5052 return MemOpCost * 2;
5061 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5062 "Should be called on only load or stores.");
5064 case Instruction::Load:
5067 return ST->getGatherOverhead();
5069 case Instruction::Store:
5072 return ST->getScatterOverhead();
5083 unsigned Opcode = (MICA.
getID() == Intrinsic::masked_gather ||
5084 MICA.
getID() == Intrinsic::vp_gather)
5086 : Instruction::Store;
5096 if (!LT.first.isValid())
5100 if (!LT.second.isVector() ||
5102 VT->getElementType()->isIntegerTy(1))
5112 ElementCount LegalVF = LT.second.getVectorElementCount();
5115 {TTI::OK_AnyValue, TTI::OP_None},
I);
5131 EVT VT = TLI->getValueType(
DL, Ty,
true);
5133 if (VT == MVT::Other)
5138 if (!LT.first.isValid())
5148 (VTy->getElementType()->isIntegerTy(1) &&
5149 !VTy->getElementCount().isKnownMultipleOf(
5159 if (Opcode == Instruction::Store)
5163 if (ST->getFixedLoadLatency())
5164 return (LT.first - 1) + ST->getFixedLoadLatency();
5173 if (LT.second.isScalableVector() ||
5174 ST->useSVEForFixedLengthVectors(LT.second)) {
5175 Inst = AArch64::LDR_ZXI;
5176 }
else if (LT.second.isVector() || LT.second.isFloatingPoint()) {
5177 switch (LT.second.getSizeInBits()) {
5179 Inst = AArch64::LDRBui;
5182 Inst = AArch64::LDRHui;
5185 Inst = AArch64::LDRSui;
5188 Inst = AArch64::LDRDui;
5191 Inst = AArch64::LDRQui;
5197 switch (LT.second.getSizeInBits()) {
5199 Inst = AArch64::LDRBBui;
5202 Inst = AArch64::LDRHHui;
5205 Inst = AArch64::LDRWui;
5208 Inst = AArch64::LDRXui;
5216 unsigned SchedClass =
TII->get(Inst).getSchedClass();
5220 float NumLoads = (LT.first - 1).
getValue();
5221 return NumLoads *
Sched.getReciprocalThroughput(*ST, *SCD) +
5222 Sched.computeInstrLatency(*ST, *SCD);
5225 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5226 LT.second.is128BitVector() && Alignment <
Align(16)) {
5232 const int AmortizationCost = 6;
5234 return LT.first * 2 * AmortizationCost;
5238 if (Ty->isPtrOrPtrVectorTy())
5243 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5245 if (VT == MVT::v4i8)
5252 if (!
isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5267 while (!TypeWorklist.
empty()) {
5289 bool UseMaskForCond,
bool UseMaskForGaps)
const {
5290 assert(Factor >= 2 &&
"Invalid interleave factor");
5305 if (!VecTy->
isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5308 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5309 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
5312 VecVTy->getElementCount().divideCoefficientBy(Factor));
5318 if (MinElts % Factor == 0 &&
5319 TLI->isLegalInterleavedAccessType(SubVecTy,
DL, UseScalable))
5320 return Factor * TLI->getNumInterleavedAccesses(SubVecTy,
DL, UseScalable);
5325 UseMaskForCond, UseMaskForGaps);
5332 for (
auto *
I : Tys) {
5333 if (!
I->isVectorTy())
5344 Align Alignment)
const {
5351 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5352 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5357 bool HasUnorderedReductions)
const {
5360 return ST->getMaxInterleaveFactor();
5370 enum { MaxStridedLoads = 7 };
5372 int StridedLoads = 0;
5375 for (
const auto BB : L->blocks()) {
5376 for (
auto &
I : *BB) {
5382 if (L->isLoopInvariant(PtrValue))
5387 if (!LSCEVAddRec || !LSCEVAddRec->
isAffine())
5396 if (StridedLoads > MaxStridedLoads / 2)
5397 return StridedLoads;
5400 return StridedLoads;
5403 int StridedLoads = countStridedLoads(L, SE);
5405 <<
" strided loads\n");
5421 unsigned *FinalSize) {
5425 for (
auto *BB : L->getBlocks()) {
5426 for (
auto &
I : *BB) {
5432 if (!Cost.isValid())
5436 if (LoopCost > Budget)
5458 if (MaxTC > 0 && MaxTC <= 32)
5469 if (Blocks.
size() != 2)
5491 if (!L->isInnermost() || L->getNumBlocks() > 8)
5495 if (!L->getExitBlock())
5501 bool HasParellelizableReductions =
5502 L->getNumBlocks() == 1 &&
5503 any_of(L->getHeader()->phis(),
5505 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5508 if (HasParellelizableReductions &&
5530 if (HasParellelizableReductions) {
5541 if (Header == Latch) {
5544 unsigned Width = 10;
5550 unsigned MaxInstsPerLine = 16;
5552 unsigned BestUC = 1;
5553 unsigned SizeWithBestUC = BestUC *
Size;
5555 unsigned SizeWithUC = UC *
Size;
5556 if (SizeWithUC > 48)
5558 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5559 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5561 SizeWithBestUC = BestUC *
Size;
5571 for (
auto *BB : L->blocks()) {
5572 for (
auto &
I : *BB) {
5582 for (
auto *U :
I.users())
5584 LoadedValuesPlus.
insert(U);
5591 return LoadedValuesPlus.
contains(
SI->getOperand(0));
5617 auto *I = dyn_cast<Instruction>(V);
5618 return I && DependsOnLoopLoad(I, Depth + 1);
5625 DependsOnLoopLoad(
I, 0)) {
5641 if (L->getLoopDepth() > 1)
5652 for (
auto *BB : L->getBlocks()) {
5653 for (
auto &
I : *BB) {
5657 if (IsVectorized &&
I.getType()->isVectorTy())
5674 if (ST->isAppleMLike())
5676 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5698 !ST->getSchedModel().isOutOfOrder()) {
5721 bool CanCreate)
const {
5725 case Intrinsic::aarch64_neon_st1x2:
5726 case Intrinsic::aarch64_neon_st1x3:
5727 case Intrinsic::aarch64_neon_st1x4:
5728 case Intrinsic::aarch64_neon_st2:
5729 case Intrinsic::aarch64_neon_st3:
5730 case Intrinsic::aarch64_neon_st4: {
5733 if (!CanCreate || !ST)
5735 unsigned NumElts = Inst->
arg_size() - 1;
5736 if (ST->getNumElements() != NumElts)
5738 for (
unsigned i = 0, e = NumElts; i != e; ++i) {
5744 for (
unsigned i = 0, e = NumElts; i != e; ++i) {
5746 Res = Builder.CreateInsertValue(Res, L, i);
5750 case Intrinsic::aarch64_neon_ld1x2:
5751 case Intrinsic::aarch64_neon_ld1x3:
5752 case Intrinsic::aarch64_neon_ld1x4:
5753 case Intrinsic::aarch64_neon_ld2:
5754 case Intrinsic::aarch64_neon_ld3:
5755 case Intrinsic::aarch64_neon_ld4:
5756 if (Inst->
getType() == ExpectedType)
5767 case Intrinsic::aarch64_neon_ld1x2:
5768 case Intrinsic::aarch64_neon_ld1x3:
5769 case Intrinsic::aarch64_neon_ld1x4:
5770 case Intrinsic::aarch64_neon_ld2:
5771 case Intrinsic::aarch64_neon_ld3:
5772 case Intrinsic::aarch64_neon_ld4:
5773 Info.ReadMem =
true;
5774 Info.WriteMem =
false;
5777 case Intrinsic::aarch64_neon_st1x2:
5778 case Intrinsic::aarch64_neon_st1x3:
5779 case Intrinsic::aarch64_neon_st1x4:
5780 case Intrinsic::aarch64_neon_st2:
5781 case Intrinsic::aarch64_neon_st3:
5782 case Intrinsic::aarch64_neon_st4:
5783 Info.ReadMem =
false;
5784 Info.WriteMem =
true;
5793 case Intrinsic::aarch64_neon_ld1x2:
5794 case Intrinsic::aarch64_neon_st1x2:
5795 Info.MatchingId = Intrinsic::aarch64_neon_ld1x2;
5797 case Intrinsic::aarch64_neon_ld1x3:
5798 case Intrinsic::aarch64_neon_st1x3:
5799 Info.MatchingId = Intrinsic::aarch64_neon_ld1x3;
5801 case Intrinsic::aarch64_neon_ld1x4:
5802 case Intrinsic::aarch64_neon_st1x4:
5803 Info.MatchingId = Intrinsic::aarch64_neon_ld1x4;
5805 case Intrinsic::aarch64_neon_ld2:
5806 case Intrinsic::aarch64_neon_st2:
5807 Info.MatchingId = Intrinsic::aarch64_neon_ld2;
5809 case Intrinsic::aarch64_neon_ld3:
5810 case Intrinsic::aarch64_neon_st3:
5811 Info.MatchingId = Intrinsic::aarch64_neon_ld3;
5813 case Intrinsic::aarch64_neon_ld4:
5814 case Intrinsic::aarch64_neon_st4:
5815 Info.MatchingId = Intrinsic::aarch64_neon_ld4;
5827 const Instruction &
I,
bool &AllowPromotionWithoutCommonHeader)
const {
5828 bool Considerable =
false;
5829 AllowPromotionWithoutCommonHeader =
false;
5832 Type *ConsideredSExtType =
5834 if (
I.getType() != ConsideredSExtType)
5838 for (
const User *U :
I.users()) {
5840 Considerable =
true;
5844 if (GEPInst->getNumOperands() > 2) {
5845 AllowPromotionWithoutCommonHeader =
true;
5850 return Considerable;
5901 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5911 return LegalizationCost + 2;
5921 LegalizationCost *= LT.first - 1;
5924 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5933 return LegalizationCost + 2;
5941 std::optional<FastMathFlags> FMF,
5957 return BaseCost + FixedVTy->getNumElements();
5974 MVT MTy = LT.second;
5975 int ISD = TLI->InstructionOpcodeToISD(Opcode);
6023 MTy.
isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
6024 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
6036 return (LT.first - 1) +
Log2_32(NElts);
6041 return (LT.first - 1) + Entry->Cost;
6053 if (LT.first != 1) {
6059 ExtraCost *= LT.first - 1;
6062 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
6063 return Cost + ExtraCost;
6071 unsigned Opcode,
bool IsUnsigned,
Type *ResTy,
VectorType *VecTy,
6073 EVT VecVT = TLI->getValueType(
DL, VecTy);
6074 EVT ResVT = TLI->getValueType(
DL, ResTy);
6084 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6086 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
6088 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
6090 return (LT.first - 1) * 2 + 2;
6101 EVT VecVT = TLI->getValueType(
DL, VecTy);
6102 EVT ResVT = TLI->getValueType(
DL, ResTy);
6105 RedOpcode == Instruction::Add) {
6111 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6113 return LT.first + 2;
6148 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
6149 ? TLI->getPromotedVTForPredicate(
EVT(LT.second))
6163 if (LT.second.getScalarType() == MVT::i1) {
6172 assert(Entry &&
"Illegal Type for Splice");
6173 LegalizationCost += Entry->Cost;
6174 return LegalizationCost * LT.first;
6178 unsigned Opcode,
Type *InputTypeA,
Type *InputTypeB,
Type *AccumType,
6187 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
6188 Opcode != Instruction::FAdd && Opcode != Instruction::FSub) ||
6195 assert(FMF &&
"Missing FastMathFlags for floating-point partial reduction");
6196 if (!FMF->allowReassoc() || !FMF->allowContract())
6200 "FastMathFlags only apply to floating-point partial reductions");
6204 (!BinOp || (OpBExtend !=
TTI::PR_None && InputTypeB)) &&
6205 "Unexpected values for OpBExtend or InputTypeB");
6209 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6210 InputTypeA != InputTypeB))
6213 bool IsUSDot = OpBExtend !=
TTI::PR_None && OpAExtend != OpBExtend;
6216 if (IsUSDot && !ST->hasMatMulInt8() && !ST->hasDotProd())
6229 auto TC = TLI->getTypeConversion(AccumVectorType->
getContext(),
6238 if (TLI->getTypeAction(AccumVectorType->
getContext(), TC.second) !=
6244 std::pair<InstructionCost, MVT> AccumLT =
6246 std::pair<InstructionCost, MVT> InputLT =
6250 auto IsSupported = [&](
bool SVEPred,
bool NEONPred) ->
bool {
6251 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6252 (AccumLT.second.isFixedLengthVector() &&
6253 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6257 bool IsSub = Opcode == Instruction::Sub || Opcode == Instruction::FSub;
6265 if (AccumLT.second.getScalarType() == MVT::i32 &&
6266 InputLT.second.getScalarType() == MVT::i8) {
6268 if (!IsUSDot && IsSupported(
true, ST->hasDotProd()))
6269 return Cost + INegCost;
6271 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6272 return Cost + INegCost;
6277 if (IsUSDot && IsSupported(
false, ST->hasDotProd()))
6278 return Cost * 3 + INegCost;
6281 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6283 if (AccumLT.second.getScalarType() == MVT::i64 &&
6284 InputLT.second.getScalarType() == MVT::i16)
6285 return Cost + INegCost;
6288 if (AccumLT.second.getScalarType() == MVT::i32 &&
6289 InputLT.second.getScalarType() == MVT::i16 &&
6290 (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub)
6293 if (AccumLT.second.getScalarType() == MVT::i64 &&
6294 InputLT.second.getScalarType() == MVT::i8)
6300 return Cost + INegCost;
6303 if (AccumLT.second.getScalarType() == MVT::i16 &&
6304 InputLT.second.getScalarType() == MVT::i8 &&
6305 (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub)
6311 if (Opcode == Instruction::FAdd && !IsSub &&
6312 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6313 AccumLT.second.getScalarType() == MVT::f32 &&
6314 InputLT.second.getScalarType() == MVT::f16)
6318 if (Ratio == 2 && !IsUSDot) {
6319 MVT InVT = InputLT.second.getScalarType();
6322 if (IsSupported(ST->hasSVE2() || ST->hasSME(),
true) &&
6327 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6331 if (IsSupported(ST->hasSVE2p1() || ST->hasSME2(),
false) &&
6332 InVT == MVT::bf16 && IsSub)
6342 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6343 return Cost * 2 + FNegCost;
6347 AccumType, VF, OpAExtend, OpBExtend,
6359 "Expected the Mask to match the return size if given");
6361 "Expected the same scalar types");
6367 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6368 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6369 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6377 return std::max<InstructionCost>(1, LT.first / 4);
6385 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6387 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6390 unsigned TpNumElts = Mask.size();
6391 unsigned LTNumElts = LT.second.getVectorNumElements();
6392 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6394 LT.second.getVectorElementCount());
6396 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>,
InstructionCost>
6398 for (
unsigned N = 0;
N < NumVecs;
N++) {
6402 unsigned Source1 = -1U, Source2 = -1U;
6403 unsigned NumSources = 0;
6404 for (
unsigned E = 0; E < LTNumElts; E++) {
6405 int MaskElt = (
N * LTNumElts + E < TpNumElts) ? Mask[
N * LTNumElts + E]
6414 unsigned Source = MaskElt / LTNumElts;
6415 if (NumSources == 0) {
6418 }
else if (NumSources == 1 && Source != Source1) {
6421 }
else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6427 if (Source == Source1)
6429 else if (Source == Source2)
6430 NMask.
push_back(MaskElt % LTNumElts + LTNumElts);
6439 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6450 NTp, NTp, NMask,
CostKind, 0,
nullptr, Args,
6453 Result.first->second = NCost;
6467 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6468 if (LT.second.getFixedSizeInBits() >= 128 &&
6470 LT.second.getVectorNumElements() / 2) {
6473 if (Index == (
int)LT.second.getVectorNumElements() / 2)
6487 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6490 return M.value() < 0 || M.value() == (int)M.index();
6496 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6497 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6506 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6507 ST->isSVEorStreamingSVEAvailable() &&
6512 if (ST->isSVEorStreamingSVEAvailable() &&
6526 if (IsLoad && LT.second.isVector() &&
6528 LT.second.getVectorElementCount()))
6534 if (Mask.size() == 4 &&
6536 (SrcTy->getScalarSizeInBits() == 16 ||
6537 SrcTy->getScalarSizeInBits() == 32) &&
6538 all_of(Mask, [](
int E) {
return E < 8; }))
6544 if (LT.second.isFixedLengthVector() &&
6545 LT.second.getVectorNumElements() == Mask.size() &&
6551 (
isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6552 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6553 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6554 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6555 LT.second.getVectorNumElements(), 16) ||
6556 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6557 LT.second.getVectorNumElements(), 32) ||
6558 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6559 LT.second.getVectorNumElements(), 64) ||
6562 [&Mask](
int M) {
return M < 0 || M == Mask[0]; })))
6691 return LT.first * Entry->Cost;
6700 LT.second.getSizeInBits() <= 128 && SubTp) {
6702 if (SubLT.second.isVector()) {
6703 int NumElts = LT.second.getVectorNumElements();
6704 int NumSubElts = SubLT.second.getVectorNumElements();
6705 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6711 if (IsExtractSubvector)
6728 if (
getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6747 return ST->useFixedOverScalableIfEqualCost();
6751 return ST->getEpilogueVectorizationMinVF();
6786 unsigned NumInsns = 0;
6788 NumInsns += BB->size();
6798 int64_t Scale,
unsigned AddrSpace)
const {
6826 if (
I->getOpcode() == Instruction::Or &&
6830 if (
I->getOpcode() == Instruction::Add ||
6831 I->getOpcode() == Instruction::Sub)
6856 return all_equal(Shuf->getShuffleMask());
6863 bool AllowSplat =
false) {
6868 auto areTypesHalfed = [](
Value *FullV,
Value *HalfV) {
6869 auto *FullTy = FullV->
getType();
6870 auto *HalfTy = HalfV->getType();
6872 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6875 auto extractHalf = [](
Value *FullV,
Value *HalfV) {
6878 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6882 Value *S1Op1 =
nullptr, *S2Op1 =
nullptr;
6896 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6897 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6911 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6912 (M2Start != 0 && M2Start != (NumElements / 2)))
6914 if (S1Op1 && S2Op1 && M1Start != M2Start)
6924 return Ext->getType()->getScalarSizeInBits() ==
6925 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6939 Value *VectorOperand =
nullptr;
6956 if (!
GEP ||
GEP->getNumOperands() != 2)
6960 Value *Offsets =
GEP->getOperand(1);
6963 if (
Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6969 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6970 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6971 Ops.push_back(&
GEP->getOperandUse(1));
7007 switch (
II->getIntrinsicID()) {
7008 case Intrinsic::aarch64_neon_smull:
7009 case Intrinsic::aarch64_neon_umull:
7012 Ops.push_back(&
II->getOperandUse(0));
7013 Ops.push_back(&
II->getOperandUse(1));
7018 case Intrinsic::fma:
7019 case Intrinsic::fmuladd:
7026 Ops.push_back(&
II->getOperandUse(0));
7028 Ops.push_back(&
II->getOperandUse(1));
7031 case Intrinsic::aarch64_neon_sqdmull:
7032 case Intrinsic::aarch64_neon_sqdmulh:
7033 case Intrinsic::aarch64_neon_sqrdmulh:
7036 Ops.push_back(&
II->getOperandUse(0));
7038 Ops.push_back(&
II->getOperandUse(1));
7039 return !
Ops.empty();
7040 case Intrinsic::aarch64_neon_fmlal:
7041 case Intrinsic::aarch64_neon_fmlal2:
7042 case Intrinsic::aarch64_neon_fmlsl:
7043 case Intrinsic::aarch64_neon_fmlsl2:
7046 Ops.push_back(&
II->getOperandUse(1));
7048 Ops.push_back(&
II->getOperandUse(2));
7049 return !
Ops.empty();
7050 case Intrinsic::aarch64_sve_ptest_first:
7051 case Intrinsic::aarch64_sve_ptest_last:
7053 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
7054 Ops.push_back(&
II->getOperandUse(0));
7055 return !
Ops.empty();
7056 case Intrinsic::aarch64_sme_write_horiz:
7057 case Intrinsic::aarch64_sme_write_vert:
7058 case Intrinsic::aarch64_sme_writeq_horiz:
7059 case Intrinsic::aarch64_sme_writeq_vert: {
7061 if (!Idx || Idx->getOpcode() != Instruction::Add)
7063 Ops.push_back(&
II->getOperandUse(1));
7066 case Intrinsic::aarch64_sme_read_horiz:
7067 case Intrinsic::aarch64_sme_read_vert:
7068 case Intrinsic::aarch64_sme_readq_horiz:
7069 case Intrinsic::aarch64_sme_readq_vert:
7070 case Intrinsic::aarch64_sme_ld1b_vert:
7071 case Intrinsic::aarch64_sme_ld1h_vert:
7072 case Intrinsic::aarch64_sme_ld1w_vert:
7073 case Intrinsic::aarch64_sme_ld1d_vert:
7074 case Intrinsic::aarch64_sme_ld1q_vert:
7075 case Intrinsic::aarch64_sme_st1b_vert:
7076 case Intrinsic::aarch64_sme_st1h_vert:
7077 case Intrinsic::aarch64_sme_st1w_vert:
7078 case Intrinsic::aarch64_sme_st1d_vert:
7079 case Intrinsic::aarch64_sme_st1q_vert:
7080 case Intrinsic::aarch64_sme_ld1b_horiz:
7081 case Intrinsic::aarch64_sme_ld1h_horiz:
7082 case Intrinsic::aarch64_sme_ld1w_horiz:
7083 case Intrinsic::aarch64_sme_ld1d_horiz:
7084 case Intrinsic::aarch64_sme_ld1q_horiz:
7085 case Intrinsic::aarch64_sme_st1b_horiz:
7086 case Intrinsic::aarch64_sme_st1h_horiz:
7087 case Intrinsic::aarch64_sme_st1w_horiz:
7088 case Intrinsic::aarch64_sme_st1d_horiz:
7089 case Intrinsic::aarch64_sme_st1q_horiz: {
7091 if (!Idx || Idx->getOpcode() != Instruction::Add)
7093 Ops.push_back(&
II->getOperandUse(3));
7096 case Intrinsic::aarch64_neon_pmull:
7099 Ops.push_back(&
II->getOperandUse(0));
7100 Ops.push_back(&
II->getOperandUse(1));
7102 case Intrinsic::aarch64_neon_pmull64:
7104 II->getArgOperand(1)))
7106 Ops.push_back(&
II->getArgOperandUse(0));
7107 Ops.push_back(&
II->getArgOperandUse(1));
7109 case Intrinsic::masked_gather:
7112 Ops.push_back(&
II->getArgOperandUse(0));
7114 case Intrinsic::masked_scatter:
7117 Ops.push_back(&
II->getArgOperandUse(1));
7124 auto ShouldSinkCondition = [](
Value *
Cond,
7129 if (
II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
7133 Ops.push_back(&
II->getOperandUse(0));
7137 switch (
I->getOpcode()) {
7138 case Instruction::GetElementPtr:
7139 case Instruction::Add:
7140 case Instruction::Sub:
7142 for (
unsigned Op = 0;
Op <
I->getNumOperands(); ++
Op) {
7144 Ops.push_back(&
I->getOperandUse(
Op));
7149 case Instruction::Select: {
7150 if (!ShouldSinkCondition(
I->getOperand(0),
Ops))
7153 Ops.push_back(&
I->getOperandUse(0));
7156 case Instruction::UncondBr:
7158 case Instruction::CondBr: {
7162 Ops.push_back(&
I->getOperandUse(0));
7165 case Instruction::FMul:
7170 Ops.push_back(&
I->getOperandUse(0));
7172 Ops.push_back(&
I->getOperandUse(1));
7182 case Instruction::Xor:
7185 if (
I->getType()->isVectorTy() && ST->isNeonAvailable()) {
7187 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
7192 case Instruction::And:
7193 case Instruction::Or:
7196 if (
I->getOpcode() == Instruction::Or &&
7201 if (!(
I->getType()->isVectorTy() && ST->hasNEON()) &&
7204 for (
auto &
Op :
I->operands()) {
7216 Ops.push_back(&Not);
7217 Ops.push_back(&InsertElt);
7227 if (!
I->getType()->isVectorTy())
7228 return !
Ops.empty();
7230 switch (
I->getOpcode()) {
7231 case Instruction::Sub:
7232 case Instruction::Add: {
7241 Ops.push_back(&Ext1->getOperandUse(0));
7242 Ops.push_back(&Ext2->getOperandUse(0));
7245 Ops.push_back(&
I->getOperandUse(0));
7246 Ops.push_back(&
I->getOperandUse(1));
7250 case Instruction::Or: {
7253 if (ST->hasNEON()) {
7267 if (
I->getParent() != MainAnd->
getParent() ||
7272 if (
I->getParent() != IA->getParent() ||
7273 I->getParent() != IB->getParent())
7278 Ops.push_back(&
I->getOperandUse(0));
7279 Ops.push_back(&
I->getOperandUse(1));
7288 case Instruction::Mul: {
7289 auto ShouldSinkSplatForIndexedVariant = [](
Value *V) {
7292 if (Ty->isScalableTy())
7296 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7299 int NumZExts = 0, NumSExts = 0;
7300 for (
auto &
Op :
I->operands()) {
7307 auto *ExtOp = Ext->getOperand(0);
7308 if (
isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7309 Ops.push_back(&Ext->getOperandUse(0));
7317 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7318 I->getType()->getScalarSizeInBits())
7355 if (!ElementConstant || !ElementConstant->
isZero())
7358 unsigned Opcode = OperandInstr->
getOpcode();
7359 if (Opcode == Instruction::SExt)
7361 else if (Opcode == Instruction::ZExt)
7366 unsigned Bitwidth =
I->getType()->getScalarSizeInBits();
7376 Ops.push_back(&Insert->getOperandUse(1));
7382 if (!
Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7386 if (!ShouldSinkSplatForIndexedVariant(
I))
7391 Ops.push_back(&
I->getOperandUse(0));
7393 Ops.push_back(&
I->getOperandUse(1));
7395 return !
Ops.empty();
7397 case Instruction::FMul: {
7399 if (
I->getType()->isScalableTy())
7400 return !
Ops.empty();
7404 return !
Ops.empty();
7408 Ops.push_back(&
I->getOperandUse(0));
7410 Ops.push_back(&
I->getOperandUse(1));
7411 return !
Ops.empty();
static bool isAllActivePredicate(const SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
static Value * getCondition(Instruction *I)
const HexagonInstrInfo * TII
This file provides the interface for the instcombine pass implementation.
static constexpr Value * getValue(Ty &ValueOrUse)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
This file defines the LoopVectorizationLegality class.
static const Function * getCalledFunction(const Value *V)
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
static SymbolRef::Type getType(const Symbol *Sym)
This file describes how to lower LLVM code to machine code.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
InstructionCost getBranchMispredictPenalty() const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
unsigned getMaxInterleaveFactor(ElementCount VF, bool HasUnorderedReductions) const override
Class for arbitrary precision integers.
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
unsigned popcount() const
Count the number of bits set.
unsigned countLeadingOnes() const
void negate()
Negate this APInt in place.
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
unsigned logBase2() const
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
int64_t getSExtValue() const
Get sign extended value.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
LLVM Basic Block Representation.
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ ICMP_SLT
signed less than
@ ICMP_SLE
signed less or equal
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
@ ICMP_UGT
unsigned greater than
@ ICMP_SGT
signed greater than
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
@ ICMP_SGE
signed greater or equal
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
static bool isIntPredicate(Predicate P)
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
const APInt & getValue() const
Return the constant as an APInt value reference.
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
constexpr bool isScalar() const
Exactly one element.
This provides a helper for copying FMF from an instruction or setting specified flags.
Convenience struct for specifying and reasoning about fast-math flags.
bool noSignedZeros() const
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
Type * getReturnType() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
Intrinsic::ID getID() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
DominatorTree * getDominatorTree() const
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
bool isFixedLengthVector() const
MVT getVectorElementType() const
Information for memory intrinsic cost model.
Align getAlignment() const
Type * getDataType() const
Intrinsic::ID getID() const
const Instruction * getInst() const
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrences variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresSMChange() const
bool requiresLazySave() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
static StackOffset getScalable(int64_t Scalable)
static StackOffset getFixed(int64_t Fixed)
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
static constexpr TypeSize getFixed(ScalarTy ExactSize)
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
bool isVectorTy() const
True if this is an instance of VectorType.
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
bool isPointerTy() const
True if this is an instance of PointerType.
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isIntegerTy() const
True if this is an instance of IntegerType.
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
const ParentTy * getParent() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
@ ADD
Simple integer binary arithmetic operators.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ FADD
Simple binary floating point operators.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ SIGN_EXTEND
Conversion operators.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64-bit edition).
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool ShouldCheckWrap=true, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
unsigned getMatchingIROpode() const
bool inactiveLanesAreUnused() const
bool inactiveLanesAreNotDefined() const
bool hasMatchingUndefIntrinsic() const
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
bool hasGoverningPredicate() const
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
bool inactiveLanesTakenFromOperand() const
static SVEIntrinsicInfo defaultUndefOp()
bool hasOperandWithNoActiveLanes() const
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
bool hasMatchingIROpode() const
bool resultIsZeroInitialized() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
uint64_t getScalarSizeInBits() const
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
bool isFixedLengthVector() const
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Machine model for scheduling, bundling, and heuristics.
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...