#include "llvm/IR/IntrinsicsAArch64.h"

#define DEBUG_TYPE "aarch64tti"
52 "Penalty of calling a function that requires a change to PSTATE.SM"));
56 cl::desc(
"Penalty of inlining a call that requires a change to PSTATE.SM"));
class TailFoldingOption {

  bool NeedsDefault = true;

  void setNeedsDefault(bool V) { NeedsDefault = V; }

    assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
           "Initial bits should only include one of "
           "(disabled|all|simple|default)");
    Bits = NeedsDefault ? DefaultBits : InitialBits;
    Bits &= ~DisableBits;
    errs() << "invalid argument '" << Opt
           << "' to -sve-tail-folding=; the option should be of the form\n"
              "  (disabled|all|default|simple)[+(reductions|recurrences"
              "|reverse|noreductions|norecurrences|noreverse)]\n";
  void operator=(const std::string &Val) {

    setNeedsDefault(false);

    unsigned StartIdx = 1;
    if (TailFoldTypes[0] == "disabled")
      setInitialBits(TailFoldingOpts::Disabled);
    else if (TailFoldTypes[0] == "all")
      setInitialBits(TailFoldingOpts::All);
    else if (TailFoldTypes[0] == "default")
      setNeedsDefault(true);
    else if (TailFoldTypes[0] == "simple")
      setInitialBits(TailFoldingOpts::Simple);
    else {
      StartIdx = 0;
      setInitialBits(TailFoldingOpts::Disabled);
    }
    for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
      if (TailFoldTypes[I] == "reductions")
        setEnableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "recurrences")
        setEnableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "reverse")
        setEnableBit(TailFoldingOpts::Reverse);
      else if (TailFoldTypes[I] == "noreductions")
        setDisableBit(TailFoldingOpts::Reductions);
      else if (TailFoldTypes[I] == "norecurrences")
        setDisableBit(TailFoldingOpts::Recurrences);
      else if (TailFoldTypes[I] == "noreverse")
        setDisableBit(TailFoldingOpts::Reverse);
    }
173 "Control the use of vectorisation using tail-folding for SVE where the"
174 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
175 "\ndisabled (Initial) No loop types will vectorize using "
177 "\ndefault (Initial) Uses the default tail-folding settings for "
179 "\nall (Initial) All legal loop types will vectorize using "
181 "\nsimple (Initial) Use tail-folding for simple loops (not "
182 "reductions or recurrences)"
183 "\nreductions Use tail-folding for loops containing reductions"
184 "\nnoreductions Inverse of above"
185 "\nrecurrences Use tail-folding for loops containing fixed order "
187 "\nnorecurrences Inverse of above"
188 "\nreverse Use tail-folding for loops requiring reversed "
190 "\nnoreverse Inverse of above"),
      .Case("__arm_sme_state", true)
      .Case("__arm_tpidr2_save", true)
      .Case("__arm_tpidr2_restore", true)
      .Case("__arm_za_disable", true)
    if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
        (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
  SMEAttrs CallerAttrs(*Caller), CalleeAttrs(*Callee);

  if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
      CallerAttrs.requiresSMChange(CalleeAttrs)) {

  const FeatureBitset &CallerBits =
      TM.getSubtargetImpl(*Caller)->getFeatureBits();
  const FeatureBitset &CalleeBits =
      TM.getSubtargetImpl(*Callee)->getFeatureBits();

  return (CallerBits & CalleeBits) == CalleeBits;
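// Editorial note: the subset test above permits inlining only when every
// feature the callee was built with is also enabled in the caller; e.g. a
// callee compiled with +sve cannot be inlined into a caller built without SVE.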
  auto FVTy = dyn_cast<FixedVectorType>(Ty);
  return FVTy &&
         FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
                                               unsigned DefaultCallPenalty) const {

  if (F == Call.getCaller())

  return DefaultCallPenalty;
  ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);

  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {

  return std::max<InstructionCost>(1, Cost);
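// Editorial note: the immediate is sign-extended up to a multiple of 64 bits
// and costed one 64-bit chunk at a time (one materialisation step per chunk,
// MOVZ/MOVK-style); the result is clamped so even a trivially-encodable
// immediate reports a cost of at least 1.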
  unsigned ImmIdx = ~0U;
  switch (Opcode) {
  case Instruction::GetElementPtr:

  case Instruction::Store:

  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::ICmp:

  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:

  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::IntToPtr:
  case Instruction::PtrToInt:
  case Instruction::BitCast:
  case Instruction::PHI:
  case Instruction::Call:
  case Instruction::Select:
  case Instruction::Ret:
  case Instruction::Load:

    int NumConstants = (BitSize + 63) / 64;
  if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)

  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow:

    int NumConstants = (BitSize + 63) / 64;

  case Intrinsic::experimental_stackmap:
    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))

  case Intrinsic::experimental_patchpoint_void:
  case Intrinsic::experimental_patchpoint:
    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))

  case Intrinsic::experimental_gc_statepoint:
    if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
  if (TyWidth == 32 || TyWidth == 64)
  switch (ICA.getID()) {
  case Intrinsic::umin:
  case Intrinsic::umax:
  case Intrinsic::smin:
  case Intrinsic::smax: {
    static const auto ValidMinMaxTys = {MVT::v8i8,    MVT::v16i8,  MVT::v4i16,
                                        MVT::v8i16,   MVT::v2i32,  MVT::v4i32,
                                        MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,

    if (LT.second == MVT::v2i64)
    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }))

  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat: {
    static const auto ValidSatTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,

    unsigned Instrs =
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
      return LT.first * Instrs;
  case Intrinsic::abs: {
    static const auto ValidAbsTys = {MVT::v8i8,  MVT::v16i8, MVT::v4i16,
                                     MVT::v8i16, MVT::v2i32, MVT::v4i32,

    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }))

  case Intrinsic::bswap: {
    static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
                                     MVT::v4i32, MVT::v2i64};

    if (any_of(ValidAbsTys, [&LT](MVT M) { return M == LT.second; }) &&
        LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
  case Intrinsic::experimental_stepvector: {
    Cost += AddCost * (LT.first - 1);

  case Intrinsic::vector_extract:
  case Intrinsic::vector_insert: {
    bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
  case Intrinsic::bitreverse: {
    static const CostTblEntry BitreverseTbl[] = {
        {Intrinsic::bitreverse, MVT::i32, 1},
        {Intrinsic::bitreverse, MVT::i64, 1},
        {Intrinsic::bitreverse, MVT::v8i8, 1},
        {Intrinsic::bitreverse, MVT::v16i8, 1},
        {Intrinsic::bitreverse, MVT::v4i16, 2},
        {Intrinsic::bitreverse, MVT::v8i16, 2},
        {Intrinsic::bitreverse, MVT::v2i32, 2},
        {Intrinsic::bitreverse, MVT::v4i32, 2},
        {Intrinsic::bitreverse, MVT::v1i64, 2},
        {Intrinsic::bitreverse, MVT::v2i64, 2},

      return LegalisationCost.first * Entry->Cost + 1;
    return LegalisationCost.first * Entry->Cost;
  case Intrinsic::ctpop: {
    if (!ST->hasNEON()) {

        RetTy->getScalarSizeInBits()

    return LT.first * Entry->Cost + ExtraCost;
  case Intrinsic::sadd_with_overflow:
  case Intrinsic::uadd_with_overflow:
  case Intrinsic::ssub_with_overflow:
  case Intrinsic::usub_with_overflow:
  case Intrinsic::smul_with_overflow:
  case Intrinsic::umul_with_overflow: {
    static const CostTblEntry WithOverflowCostTbl[] = {
        {Intrinsic::sadd_with_overflow, MVT::i8, 3},
        {Intrinsic::uadd_with_overflow, MVT::i8, 3},
        {Intrinsic::sadd_with_overflow, MVT::i16, 3},
        {Intrinsic::uadd_with_overflow, MVT::i16, 3},
        {Intrinsic::sadd_with_overflow, MVT::i32, 1},
        {Intrinsic::uadd_with_overflow, MVT::i32, 1},
        {Intrinsic::sadd_with_overflow, MVT::i64, 1},
        {Intrinsic::uadd_with_overflow, MVT::i64, 1},
        {Intrinsic::ssub_with_overflow, MVT::i8, 3},
        {Intrinsic::usub_with_overflow, MVT::i8, 3},
        {Intrinsic::ssub_with_overflow, MVT::i16, 3},
        {Intrinsic::usub_with_overflow, MVT::i16, 3},
        {Intrinsic::ssub_with_overflow, MVT::i32, 1},
        {Intrinsic::usub_with_overflow, MVT::i32, 1},
        {Intrinsic::ssub_with_overflow, MVT::i64, 1},
        {Intrinsic::usub_with_overflow, MVT::i64, 1},
        {Intrinsic::smul_with_overflow, MVT::i8, 5},
        {Intrinsic::umul_with_overflow, MVT::i8, 4},
        {Intrinsic::smul_with_overflow, MVT::i16, 5},
        {Intrinsic::umul_with_overflow, MVT::i16, 4},
        {Intrinsic::smul_with_overflow, MVT::i32, 2},
        {Intrinsic::umul_with_overflow, MVT::i32, 2},
        {Intrinsic::smul_with_overflow, MVT::i64, 3},
        {Intrinsic::umul_with_overflow, MVT::i64, 3},
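// Editorial note (hedged sketch): tables like the one above are consulted via
// CostTableLookup(Table, ICA.getID(), MTy); a matching entry yields the listed
// per-operation cost, while a miss falls through to the generic cost model.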
  case Intrinsic::fptosi_sat:
  case Intrinsic::fptoui_sat: {

    bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;

    if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
         LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
         LT.second == MVT::v2f64) &&

         (LT.second == MVT::f64 && MTy == MVT::i32) ||
         (LT.second == MVT::f32 && MTy == MVT::i64)))

    if (ST->hasFullFP16() &&
        ((LT.second == MVT::f16 && MTy == MVT::i32) ||
         ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&

    if ((LT.second.getScalarType() == MVT::f32 ||
         LT.second.getScalarType() == MVT::f64 ||
         (ST->hasFullFP16() && LT.second.getScalarType() == MVT::f16)) &&

    if (LT.second.isVector())

        LegalTy, {LegalTy, LegalTy});
        LegalTy, {LegalTy, LegalTy});

    return LT.first * Cost;
  case Intrinsic::fshl:
  case Intrinsic::fshr: {

    static const CostTblEntry FshlTbl[] = {
        {Intrinsic::fshl, MVT::v4i32, 3},
        {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
        {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
        {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};

      return LegalisationCost.first * Entry->Cost;
    if (!RetTy->isIntegerTy())

    bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
                       RetTy->getScalarSizeInBits() < 64) ||
                      (RetTy->getScalarSizeInBits() % 64 != 0);
    unsigned ExtraCost = HigherCost ? 1 : 0;
    if (RetTy->getScalarSizeInBits() == 32 ||
        RetTy->getScalarSizeInBits() == 64)

    return TyL.first + ExtraCost;
  case Intrinsic::get_active_lane_mask: {
    if (!getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT) &&

      return RetTy->getNumElements() * 2;
  auto RequiredType = II.getType();

  assert(PN && "Expected Phi Node!");

  if (!PN->hasOneUse())

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
    if (!Reinterpret ||
        Reinterpret->getIntrinsicID() !=
            Intrinsic::aarch64_sve_convert_to_svbool ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
static std::optional<Instruction *>
tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II) {
  auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));

  auto IntrinsicID = BinOp->getIntrinsicID();
  switch (IntrinsicID) {
  case Intrinsic::aarch64_sve_and_z:
  case Intrinsic::aarch64_sve_bic_z:
  case Intrinsic::aarch64_sve_eor_z:
  case Intrinsic::aarch64_sve_nand_z:
  case Intrinsic::aarch64_sve_nor_z:
  case Intrinsic::aarch64_sve_orn_z:
  case Intrinsic::aarch64_sve_orr_z:

  auto BinOpPred = BinOp->getOperand(0);
  auto BinOpOp1 = BinOp->getOperand(1);
  auto BinOpOp2 = BinOp->getOperand(2);

  auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
  if (!PredIntr ||
      PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)

  auto PredOp = PredIntr->getOperand(0);
  auto PredOpTy = cast<VectorType>(PredOp->getType());

      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
  NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
  if (BinOpOp1 == BinOpOp2)
    NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
      Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
static std::optional<Instruction *>
instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II) {

  if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
      isa<TargetExtType>(II.getType()))

  const auto *IVTy = cast<VectorType>(II.getType());

    const auto *CursorVTy = cast<VectorType>(Cursor->getType());
    if (CursorVTy->getElementCount().getKnownMinValue() <
        IVTy->getElementCount().getKnownMinValue())

      EarliestReplacement = Cursor;

    auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);

    if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_to_svbool ||
                              IntrinsicCursor->getIntrinsicID() ==
                                  Intrinsic::aarch64_sve_convert_from_svbool))

    CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
    Cursor = IntrinsicCursor->getOperand(0);

  if (!EarliestReplacement)
  if (match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_convert_from_svbool>(
                      m_Intrinsic<Intrinsic::aarch64_sve_convert_to_svbool>(

    if (cast<ScalableVectorType>(Pred->getType())->getMinNumElements() <=
        cast<ScalableVectorType>(UncastedPred->getType())->getMinNumElements())

  return match(Pred, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                         m_ConstantInt<AArch64SVEPredPattern::all>()));
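// Editorial note: a predicate counts as "all active" either when it is a
// ptrue with the `all` pattern, or when a wider all-active predicate was
// merely round-tripped through convert_to/from_svbool without being narrowed.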
    return std::nullopt;

    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::vl1)
    return std::nullopt;

  Insert->insertBefore(&II);
  Insert->takeName(&II);

  Splat->takeName(&II);
  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
  if (PTruePattern != AArch64SVEPredPattern::all)
    return std::nullopt;

  if (!SplatValue || !SplatValue->isZero())
    return std::nullopt;

  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
  if (!DupQLane ||
      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
    return std::nullopt;

  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
    return std::nullopt;

  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
  if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
    return std::nullopt;

  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
    return std::nullopt;

  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
    return std::nullopt;

  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
  if (!ConstVec)
    return std::nullopt;

  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
    return std::nullopt;
  unsigned NumElts = VecTy->getNumElements();
  unsigned PredicateBits = 0;

  for (unsigned I = 0; I < NumElts; ++I) {
    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
    if (!Arg)
      return std::nullopt;

      PredicateBits |= 1 << (I * (16 / NumElts));

  if (PredicateBits == 0) {
    PFalse->takeName(&II);

  for (unsigned I = 0; I < 16; ++I)
    if ((PredicateBits & (1 << I)) != 0)

  unsigned PredSize = Mask & -Mask;

  for (unsigned I = 0; I < 16; I += PredSize)
    if ((PredicateBits & (1 << I)) == 0)
      return std::nullopt;

      {PredType}, {PTruePat});
      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
  auto *ConvertFromSVBool =
      {II.getType()}, {ConvertToSVBool});
  bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;

    auto *OldBinOp = cast<BinaryOperator>(Vec);
    auto OpC = OldBinOp->getOpcode();

        OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());

  auto *C = dyn_cast<Constant>(Pg);
  if (IsAfter && C && C->isNullValue()) {

    Extract->insertBefore(&II);
    Extract->takeName(&II);

  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
  if (!IntrPG)
    return std::nullopt;

  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    return std::nullopt;

  const auto PTruePattern =
      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();

    return std::nullopt;

  unsigned Idx = MinNumElts - 1;

  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
  if (Idx >= PgVTy->getMinNumElements())
    return std::nullopt;

  Extract->insertBefore(&II);
  Extract->takeName(&II);
    return std::nullopt;

    return std::nullopt;

      FPTy, cast<VectorType>(Vec->getType())->getElementCount());

      II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
static std::optional<Instruction *>
instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts) {

  if (Pattern == AArch64SVEPredPattern::all) {

  return MinNumElts && NumElts >= MinNumElts
             ? std::optional<Instruction *>(IC.replaceInstUsesWith(
                   II, ConstantInt::get(II.getType(), MinNumElts)))
             : std::nullopt;
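// Illustrative example (editorial): when the vector length is pinned, e.g.
// vscale_range(4,4) giving a 512-bit VL, a call such as
//   @llvm.aarch64.sve.cntw(i32 31)   ; pattern "all"
// folds to the constant 16 (sixteen 32-bit lanes).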
  if (PgVal == OpVal &&

    Value *Ops[] = {PgVal, OpVal};

    return std::nullopt;

  if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&

  if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
      ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
       (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
       (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
       (OpIID == Intrinsic::aarch64_sve_and_z) ||
       (OpIID == Intrinsic::aarch64_sve_bic_z) ||
       (OpIID == Intrinsic::aarch64_sve_eor_z) ||
       (OpIID == Intrinsic::aarch64_sve_nand_z) ||
       (OpIID == Intrinsic::aarch64_sve_nor_z) ||
       (OpIID == Intrinsic::aarch64_sve_orn_z) ||
       (OpIID == Intrinsic::aarch64_sve_orr_z))) {

  return std::nullopt;
template <Intrinsic::ID MulOpc, typename Intrinsic::ID FuseOpc>
static std::optional<Instruction *>
instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II,
                                  bool MergeIntoAddendOp) {

  Value *MulOp0, *MulOp1, *AddendOp, *Mul;
  if (MergeIntoAddendOp) {
      return std::nullopt;

  if (!Mul->hasOneUse())
    return std::nullopt;

    return std::nullopt;
    return std::nullopt;

  if (MergeIntoAddendOp)
        {P, AddendOp, MulOp0, MulOp1}, FMFSource);
        {P, MulOp0, MulOp1, AddendOp}, FMFSource);
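// Sketch of the transform (editorial): a predicated multiply feeding a
// predicated add/sub under the same predicate, where the multiply has a
// single use, is folded into one fused multiply-accumulate intrinsic
// (e.g. fmul + fadd -> fmla) when the fast-math flags allow contraction.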
static std::optional<Instruction *>
instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {

  Load->copyMetadata(II);

static std::optional<Instruction *>
instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL) {

  Store->copyMetadata(II);
  switch (Intrinsic) {
  case Intrinsic::aarch64_sve_fmul_u:
    return Instruction::BinaryOps::FMul;
  case Intrinsic::aarch64_sve_fadd_u:
    return Instruction::BinaryOps::FAdd;
  case Intrinsic::aarch64_sve_fsub_u:
    return Instruction::BinaryOps::FSub;
  default:
    return Instruction::BinaryOpsEnd;
  }
static std::optional<Instruction *>
instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II) {

    return std::nullopt;

  if (BinOpCode == Instruction::BinaryOpsEnd ||
      !match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;

  if (!match(OpPredicate, m_Intrinsic<Intrinsic::aarch64_sve_ptrue>(
                              m_ConstantInt<AArch64SVEPredPattern::all>())))
    return std::nullopt;
static std::optional<Instruction *>
instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II) {

                                            Intrinsic::aarch64_sve_mla>(
                                            Intrinsic::aarch64_sve_mad>(
  return std::nullopt;
}

static std::optional<Instruction *>
instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II) {

                                     Intrinsic::aarch64_sve_fmla>(IC, II,
                                     Intrinsic::aarch64_sve_fmad>(IC, II,
                                     Intrinsic::aarch64_sve_fmla>(IC, II,
  return std::nullopt;
}

static std::optional<Instruction *>
instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II) {

                                     Intrinsic::aarch64_sve_fmla>(IC, II,
                                     Intrinsic::aarch64_sve_fmad>(IC, II,
                                     Intrinsic::aarch64_sve_fmla_u>(

static std::optional<Instruction *>
instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II) {

                                     Intrinsic::aarch64_sve_fmls>(IC, II,
                                     Intrinsic::aarch64_sve_fnmsb>(
                                     Intrinsic::aarch64_sve_fmls>(IC, II,
  return std::nullopt;
}

static std::optional<Instruction *>
instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II) {

                                     Intrinsic::aarch64_sve_fmls>(IC, II,
                                     Intrinsic::aarch64_sve_fnmsb>(
                                     Intrinsic::aarch64_sve_fmls_u>(

                                            Intrinsic::aarch64_sve_mls>(
  return std::nullopt;
  auto IsUnitSplat = [](auto *I) {

  auto IsUnitDup = [](auto *I) {
    auto *IntrI = dyn_cast<IntrinsicInst>(I);
    if (!IntrI || IntrI->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
      return false;

    auto *SplatValue = IntrI->getOperand(2);

  if (IsUnitSplat(OpMultiplier)) {
    OpMultiplicand->takeName(&II);
  } else if (IsUnitDup(OpMultiplier)) {
    auto *DupInst = cast<IntrinsicInst>(OpMultiplier);
    auto *DupPg = DupInst->getOperand(1);

    if (OpPredicate == DupPg) {
      OpMultiplicand->takeName(&II);
  bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||

  return std::nullopt;

  auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
  if (!SplatValue ||
      SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
    return std::nullopt;
  constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
  constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;

  if ((match(II.getArgOperand(0),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(A)))) &&
       match(II.getArgOperand(1),
             m_Intrinsic<FromSVB>(m_Intrinsic<ToSVB>(m_Value(B))))) ||

  auto *TyA = cast<ScalableVectorType>(A->getType());
  if (TyA == B->getType() &&

  return std::nullopt;
  return std::nullopt;

static std::optional<Instruction *>
instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II) {

  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(

        BasePtr, IndexBase);

  return std::nullopt;

static std::optional<Instruction *>
instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II) {

  if (match(Index, m_Intrinsic<Intrinsic::aarch64_sve_index>(

        BasePtr, IndexBase);

  return std::nullopt;
  ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
  if (!SplatConstantInt)
    return std::nullopt;

      Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});

      Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
      Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});

  return std::nullopt;
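// Editorial note: a predicated signed divide by a constant power of two is
// rewritten as an arithmetic shift-right-and-round (asrd) by log2(divisor);
// for a negative power-of-two divisor the asrd result is additionally negated.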
  size_t VecSize = Vec.size();

  size_t HalfVecSize = VecSize / 2;

    if (*LHS != nullptr && *RHS != nullptr) {

    if (*LHS == nullptr && *RHS != nullptr)

          m_Intrinsic<Intrinsic::vector_insert>(
      !isa<FixedVectorType>(CurrentInsertElt->getType()))
    return std::nullopt;
  auto IIScalableTy = cast<ScalableVectorType>(II.getType());

  while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
    auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
    Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
    CurrentInsertElt = InsertElt->getOperand(0);

      isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);

    return std::nullopt;

  for (size_t I = 0; I < Elts.size(); I++) {
    if (Elts[I] == nullptr)

  if (InsertEltChain == nullptr)
    return std::nullopt;

  unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
  unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
                                 IIScalableTy->getMinNumElements() /
                                 PatternWidth;

  auto *WideShuffleMaskTy =

  auto NarrowBitcast =

  return std::nullopt;
  Value *AbsPred, *MergedValue;
  if (!match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_sqabs>(
      !match(Vec, m_Intrinsic<Intrinsic::aarch64_sve_abs>(
    return std::nullopt;

    return std::nullopt;

    return std::nullopt;

      {II.getType()}, {Pred, Vec, Shift});
std::optional<Instruction *>
AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::aarch64_neon_fmaxnm:
  case Intrinsic::aarch64_neon_fminnm:
  case Intrinsic::aarch64_sve_convert_from_svbool:
  case Intrinsic::aarch64_sve_dup:
  case Intrinsic::aarch64_sve_dup_x:
  case Intrinsic::aarch64_sve_cmpne:
  case Intrinsic::aarch64_sve_cmpne_wide:
  case Intrinsic::aarch64_sve_rdffr:
  case Intrinsic::aarch64_sve_lasta:
  case Intrinsic::aarch64_sve_lastb:
  case Intrinsic::aarch64_sve_clasta_n:
  case Intrinsic::aarch64_sve_clastb_n:
  case Intrinsic::aarch64_sve_cntd:
  case Intrinsic::aarch64_sve_cntw:
  case Intrinsic::aarch64_sve_cnth:
  case Intrinsic::aarch64_sve_cntb:
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
  case Intrinsic::aarch64_sve_fabd:
  case Intrinsic::aarch64_sve_fadd:
  case Intrinsic::aarch64_sve_fadd_u:
  case Intrinsic::aarch64_sve_fdiv:
  case Intrinsic::aarch64_sve_fmax:
  case Intrinsic::aarch64_sve_fmaxnm:
  case Intrinsic::aarch64_sve_fmin:
  case Intrinsic::aarch64_sve_fminnm:
  case Intrinsic::aarch64_sve_fmla:
  case Intrinsic::aarch64_sve_fmls:
  case Intrinsic::aarch64_sve_fmul:
  case Intrinsic::aarch64_sve_fmul_u:
  case Intrinsic::aarch64_sve_fmulx:
  case Intrinsic::aarch64_sve_fnmla:
  case Intrinsic::aarch64_sve_fnmls:
  case Intrinsic::aarch64_sve_fsub:
  case Intrinsic::aarch64_sve_fsub_u:
  case Intrinsic::aarch64_sve_add:
  case Intrinsic::aarch64_sve_add_u:
                                             Intrinsic::aarch64_sve_mla_u>(
  case Intrinsic::aarch64_sve_mla:
  case Intrinsic::aarch64_sve_mls:
  case Intrinsic::aarch64_sve_mul:
  case Intrinsic::aarch64_sve_mul_u:
  case Intrinsic::aarch64_sve_sabd:
  case Intrinsic::aarch64_sve_smax:
  case Intrinsic::aarch64_sve_smin:
  case Intrinsic::aarch64_sve_smulh:
  case Intrinsic::aarch64_sve_sub:
  case Intrinsic::aarch64_sve_sub_u:
                                             Intrinsic::aarch64_sve_mls_u>(
  case Intrinsic::aarch64_sve_uabd:
  case Intrinsic::aarch64_sve_umax:
  case Intrinsic::aarch64_sve_umin:
  case Intrinsic::aarch64_sve_umulh:
  case Intrinsic::aarch64_sve_asr:
  case Intrinsic::aarch64_sve_lsl:
  case Intrinsic::aarch64_sve_lsr:
  case Intrinsic::aarch64_sve_and:
  case Intrinsic::aarch64_sve_bic:
  case Intrinsic::aarch64_sve_eor:
  case Intrinsic::aarch64_sve_orr:
  case Intrinsic::aarch64_sve_sqsub:
  case Intrinsic::aarch64_sve_uqsub:
  case Intrinsic::aarch64_sve_tbl:
  case Intrinsic::aarch64_sve_uunpkhi:
  case Intrinsic::aarch64_sve_uunpklo:
  case Intrinsic::aarch64_sve_sunpkhi:
  case Intrinsic::aarch64_sve_sunpklo:
  case Intrinsic::aarch64_sve_uzp1:
  case Intrinsic::aarch64_sve_zip1:
  case Intrinsic::aarch64_sve_zip2:
  case Intrinsic::aarch64_sve_ld1_gather_index:
  case Intrinsic::aarch64_sve_st1_scatter_index:
  case Intrinsic::aarch64_sve_ld1:
  case Intrinsic::aarch64_sve_st1:
  case Intrinsic::aarch64_sve_sdiv:
  case Intrinsic::aarch64_sve_sel:
  case Intrinsic::aarch64_sve_srshl:
  case Intrinsic::aarch64_sve_dupq_lane:

  return std::nullopt;
    SimplifyAndSetOp) const {
  switch (II.getIntrinsicID()) {
  case Intrinsic::aarch64_neon_fcvtxn:
  case Intrinsic::aarch64_neon_rshrn:
  case Intrinsic::aarch64_neon_sqrshrn:
  case Intrinsic::aarch64_neon_sqrshrun:
  case Intrinsic::aarch64_neon_sqshrn:
  case Intrinsic::aarch64_neon_sqshrun:
  case Intrinsic::aarch64_neon_sqxtn:
  case Intrinsic::aarch64_neon_sqxtun:
  case Intrinsic::aarch64_neon_uqrshrn:
  case Intrinsic::aarch64_neon_uqshrn:
  case Intrinsic::aarch64_neon_uqxtn:
    SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
    break;

  return std::nullopt;
bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                           ArrayRef<const Value *> Args,
                                           Type *SrcOverrideTy) {

  auto toVectorTy = [&](Type *ArgTy) {
        cast<VectorType>(DstTy)->getElementCount());

      (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))

  Type *SrcTy = SrcOverrideTy;

  case Instruction::Add:
  case Instruction::Sub:
    if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
      SrcTy =
          toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
    }

  case Instruction::Mul: {
    if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
        (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
      SrcTy =
          toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
    } else if (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1])) {

  assert(SrcTy && "Expected some SrcTy");
  unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();

      DstTyL.first * DstTyL.second.getVectorMinNumElements();
      SrcTyL.first * SrcTyL.second.getVectorMinNumElements();

  return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
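// Editorial example (hedged): a widening instruction doubles the element
// width without changing the element count; e.g. sign-extending two v8i8
// operands and adding them as v8i16 corresponds to a single saddl-style
// operation, so the extensions are effectively free.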
      (Src->isScalableTy() && !ST->hasSVE2()))

  auto *AddUser =
      dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (AddUser && AddUser->getOpcode() == Instruction::Add)

  auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
  if (!Shr || Shr->getOpcode() != Instruction::LShr)

  auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
  if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
      Src->getScalarSizeInBits() !=
          cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
  assert(ISD && "Invalid opcode");

  if (I && I->hasOneUser()) {
    auto *SingleUser = cast<Instruction>(*I->user_begin());
    if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands, Src)) {
      if (SingleUser->getOpcode() == Instruction::Add) {
        if (I == SingleUser->getOperand(1) ||
            (isa<CastInst>(SingleUser->getOperand(1)) &&
             cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))

  if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&

    return Cost == 0 ? 0 : 1;
    EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
    std::pair<InstructionCost, MVT> LT =

        LT.second.getVectorElementType().getSizeInBits();

      return AdjustCost(Entry->Cost);

  if (ST->hasFullFP16())

    return AdjustCost(Entry->Cost);

        Opcode, LegalTy, Src, CCH, CostKind, I);
  return Part1 + Part2;
  assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&

  assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");

  if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))

  if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())

  case Instruction::SExt:

  case Instruction::ZExt:
    if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)

  return Opcode == Instruction::PHI ? 0 : 1;
  if (!LT.second.isVector())

  if (LT.second.isFixedLengthVector()) {
    unsigned Width = LT.second.getVectorNumElements();

  if (I && dyn_cast<LoadInst>(I->getOperand(1)))

      Opcode == Instruction::InsertElement && Op0 && !isa<UndefValue>(Op0);
  return getVectorInstrCostHelper(nullptr, Val, Index, HasRealUse);

  return getVectorInstrCostHelper(&I, Val, Index, true /*HasRealUse*/);

  if (isa<ScalableVectorType>(Ty))

  return DemandedElts.popcount() * (Insert + Extract) *

      Op2Info, Args, CxtI);
    return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;

        Opcode, Ty, CostKind, Op1Info, Op2Info);

  if (isa<FixedVectorType>(Ty) && cast<FixedVectorType>(Ty)
                                          ->getPrimitiveSizeInBits()
                                          .getFixedValue() < 128) {

    if (nullptr != Entry)

    if (LT.second.getScalarType() == MVT::i8)
    else if (LT.second.getScalarType() == MVT::i16)

  if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
      return (4 + DivCost) * VTy->getNumElements();

  if (LT.second == MVT::v2i64 && ST->hasSVE())

  if (LT.second != MVT::v2i64 || isWideningInstruction(Ty, Opcode, Args))
  return LT.first * 14;

  return 2 * LT.first;

  return 2 * LT.first;
  int MaxMergeDistance = 64;

    return NumVectorInstToHideOverhead;
  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SELECT) {

    const int AmortizationCost = 20;

      VecPred = CurrentPred;

    static const auto ValidMinMaxTys = {
        MVT::v8i8,  MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
        MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
    static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};

    if (any_of(ValidMinMaxTys, [&LT](MVT M) { return M == LT.second; }) ||
        (ST->hasFullFP16() &&
         any_of(ValidFP16MinMaxTys, [&LT](MVT M) { return M == LT.second; })))

    static const TypeConversionCostTblEntry VectorSelectTbl[] = {
        {ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
        {ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
        {ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};

  if (isa<FixedVectorType>(ValTy) && ISD == ISD::SETCC) {

    if (LT.second == MVT::v4f16 && !ST->hasFullFP16())
      return LT.first * 4;
  if (ST->requiresStrictAlign()) {

  Options.AllowOverlappingLoads = true;

  Options.LoadSizes = {8, 4, 2, 1};
  Options.AllowedTailExpansions = {3, 5, 6};
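// Editorial note: memcmp expansion emits power-of-two loads (8/4/2/1 bytes),
// permits overlapping loads for awkward sizes, and additionally allows 3-,
// 5- and 6-byte tails to be handled inline rather than calling into libc.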
  return ST->hasSVE();

  if (!LT.first.isValid())

InstructionCost AArch64TTIImpl::getGatherScatterOpCost(
    unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
    Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {

  auto *VT = cast<VectorType>(DataTy);

  if (!LT.first.isValid())

  if (!LT.second.isVector() ||

  if (cast<VectorType>(DataTy)->getElementCount() ==

  ElementCount LegalVF = LT.second.getVectorElementCount();

      {TTI::OK_AnyValue, TTI::OP_None}, I);
  if (VT == MVT::Other)

  if (!LT.first.isValid())

  if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))

  if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
      LT.second.is128BitVector() && (!Alignment || *Alignment < Align(16))) {

    const int AmortizationCost = 6;

    return LT.first * 2 * AmortizationCost;

  if (VT == MVT::v4i8)

  return cast<FixedVectorType>(Ty)->getNumElements() * 2;

  if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
      *Alignment != Align(1))
  while (!TypeWorklist.empty()) {
    bool UseMaskForCond, bool UseMaskForGaps) {
  assert(Factor >= 2 && "Invalid interleave factor");
  auto *VecVTy = cast<VectorType>(VecTy);

  if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))

  if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))

  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
    unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();

        VecVTy->getElementCount().divideCoefficientBy(Factor));

    if (MinElts % Factor == 0 &&

      UseMaskForCond, UseMaskForGaps);
  for (auto *I : Tys) {
    if (!I->isVectorTy())
      continue;
    if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
  enum { MaxStridedLoads = 7 };

    int StridedLoads = 0;

    for (const auto BB : L->blocks()) {
      for (auto &I : *BB) {
        LoadInst *LMemI = dyn_cast<LoadInst>(&I);

        if (L->isLoopInvariant(PtrValue))

        const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
        if (!LSCEVAddRec || !LSCEVAddRec->isAffine())

        if (StridedLoads > MaxStridedLoads / 2)
          return StridedLoads;

    return StridedLoads;

  int StridedLoads = countStridedLoads(L, SE);
  LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
                    << " strided loads\n");
  if (L->getLoopDepth() > 1)

  for (auto *BB : L->getBlocks()) {
    for (auto &I : *BB) {

      if (I.getType()->isVectorTy())

      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {

      !ST->getSchedModel().isOutOfOrder()) {
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
                                                         Type *ExpectedType) {

  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4: {

    StructType *ST = dyn_cast<StructType>(ExpectedType);

    unsigned NumElts = Inst->arg_size() - 1;
    if (ST->getNumElements() != NumElts)

    for (unsigned i = 0, e = NumElts; i != e; ++i) {

    for (unsigned i = 0, e = NumElts; i != e; ++i) {

  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    if (Inst->getType() == ExpectedType)
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_ld4:
    Info.ReadMem = true;
    Info.WriteMem = false;
    break;
  case Intrinsic::aarch64_neon_st2:
  case Intrinsic::aarch64_neon_st3:
  case Intrinsic::aarch64_neon_st4:
    Info.ReadMem = false;
    Info.WriteMem = true;
    break;

  switch (Inst->getIntrinsicID()) {
  case Intrinsic::aarch64_neon_ld2:
  case Intrinsic::aarch64_neon_st2:
    Info.MatchingId = VECTOR_LDST_TWO_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld3:
  case Intrinsic::aarch64_neon_st3:
    Info.MatchingId = VECTOR_LDST_THREE_ELEMENTS;
    break;
  case Intrinsic::aarch64_neon_ld4:
  case Intrinsic::aarch64_neon_st4:
    Info.MatchingId = VECTOR_LDST_FOUR_ELEMENTS;
    break;
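// Editorial note: describing ld2/ld3/ld4 and st2/st3/st4 as target memory
// intrinsics with matching ids lets passes such as EarlyCSE recognise that a
// structured load following an identical structured store is redundant and
// reuse the stored values via getOrCreateResultFromMemIntrinsic above.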
bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
  bool Considerable = false;
  AllowPromotionWithoutCommonHeader = false;
  if (!isa<SExtInst>(&I))
    return false;

  Type *ConsideredSExtType =
      Type::getInt64Ty(I.getParent()->getContext());
  if (I.getType() != ConsideredSExtType)
    return false;

  for (const User *U : I.users()) {

      Considerable = true;

      if (GEPInst->getNumOperands() > 2) {
        AllowPromotionWithoutCommonHeader = true;

  return Considerable;
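// Editorial example (hedged): promoting a `sext i32 %i to i64` that feeds
// several GEPs lets CodeGenPrepare fold the extension into the addressing
// mode (e.g. [x0, w1, sxtw]) instead of materialising a separate sxtw.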
  if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())

  return LegalizationCost + 2;

  LegalizationCost *= LT.first - 1;

  assert(ISD && "Invalid opcode");

  return LegalizationCost + 2;

    std::optional<FastMathFlags> FMF,

  if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
    return BaseCost + FixedVTy->getNumElements();

  if (Opcode != Instruction::FAdd)

  auto *VTy = cast<ScalableVectorType>(ValTy);

  if (isa<ScalableVectorType>(ValTy))

  MVT MTy = LT.second;

  assert(ISD && "Invalid opcode");

  return (LT.first - 1) + Entry->Cost;

  auto *ValVTy = cast<FixedVectorType>(ValTy);

  if (LT.first != 1) {

    ExtraCost *= LT.first - 1;

  auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
  return Cost + ExtraCost;
  EVT PromotedVT = LT.second.getScalarType() == MVT::i1

  if (LT.second.getScalarType() == MVT::i1) {

  assert(Entry && "Illegal Type for Splice");
  LegalizationCost += Entry->Cost;
  return LegalizationCost * LT.first;
  if (!Mask.empty() && isa<FixedVectorType>(Tp) && LT.second.isVector() &&
      Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {

    if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
      return std::max<InstructionCost>(1, LT.first / 4);

    unsigned TpNumElts = Mask.size();
    unsigned LTNumElts = LT.second.getVectorNumElements();
    unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;

    for (unsigned N = 0; N < NumVecs; N++) {

      unsigned Source1, Source2;
      unsigned NumSources = 0;
      for (unsigned E = 0; E < LTNumElts; E++) {
        int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]

          unsigned Source = MaskElt / LTNumElts;
          if (NumSources == 0) {
          } else if (NumSources == 1 && Source != Source1) {
          } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {

          if (Source == Source1)
          else if (Source == Source2)
            NMask.push_back(MaskElt % LTNumElts + LTNumElts);

      if (NumSources <= 2)
            NTp, NMask, CostKind, 0, nullptr, Args, CxtI);
            return ME.value() % LTNumElts == ME.index();
        Cost += LTNumElts - 1;

  if (IsExtractSubvector && LT.second.isFixedLengthVector())

  bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
  if (IsLoad && LT.second.isVector() &&
          LT.second.getVectorElementCount()))

      all_of(Mask, [](int E) { return E < 8; }))

  if (!Mask.empty() && LT.second.isFixedLengthVector() &&
        return M.value() < 0 || M.value() == (int)M.index();

  if (LT.second.isFixedLengthVector() &&
      LT.second.getVectorNumElements() == Mask.size() &&

      [&Mask](int M) { return M < 0 || M == Mask[0]; })))
      return LT.first * Entry->Cost;

      LT.second.getSizeInBits() <= 128 && SubTp) {

    if (SubLT.second.isVector()) {
      int NumElts = LT.second.getVectorNumElements();
      int NumSubElts = SubLT.second.getVectorNumElements();
      if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)

  if (IsExtractSubvector)
  if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {

  unsigned NumInsns = 0;

    NumInsns += BB->sizeWithoutDebug();

                                              int64_t BaseOffset, bool HasBaseReg,
                                              int64_t Scale, unsigned AddrSpace) const {

      isa<BranchInst>(I->getNextNode()) &&
      cast<BranchInst>(I->getNextNode())->isUnconditional())
static bool isAllActivePredicate(SelectionDAG &DAG, SDValue N)
SmallVector< AArch64_IMM::ImmInsnModel, 4 > Insn
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
amdgpu AMDGPU Register Bank Select
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
Cost tables and simple lookup functions.
Returns the sub type a function will return at a given Idx Should correspond to the result type of an ExtractValue instruction executed with just that one unsigned Idx
This file provides the interface for the instcombine pass implementation.
This file defines the LoopVectorizationLegality class.
mir Rename Register Operands
static const Function * getCalledFunction(const Value *V, bool &IsNoBuiltin)
const char LLVMTargetMachineRef TM
static uint64_t getBits(uint64_t Val, int Start, int End)
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
static unsigned getFastMathFlags(const MachineInstr &I)
static SymbolRef::Type getType(const Symbol *Sym)
This file describes how to lower LLVM code to machine code.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
bool isNeonAvailable() const
Returns true if the target has NEON and the function at runtime is known to have NEON enabled (e....
unsigned getVectorInsertExtractBaseCost() const
ARMProcFamilyEnum getProcFamily() const
Returns ARM processor family.
unsigned getMaxInterleaveFactor() const
TailFoldingOpts getSVETailFoldingDefaultOpts() const
bool useSVEForFixedLengthVectors() const
bool isSVEAvailable() const
Returns true if the target has SVE and can use the full range of SVE instructions,...
unsigned getMinSVEVectorSizeInBits() const
InstructionCost getSpliceCost(VectorType *Tp, int Index)
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
bool shouldTreatInstructionLikeSelect(const Instruction *I)
InstructionCost getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr)
bool prefersVectorizedAddressing() const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index)
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType)
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader)
See if I should be considered for address type promotion.
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind)
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
bool isElementTypeLegalForScalableVector(Type *Ty) const
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys)
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool useNeonVector(const Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
bool areTypesABICompatible(const Function *Caller, const Function *Callee, const ArrayRef< Type * > &Types) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool preferPredicateOverEpilogue(TailFoldingInfo *TFI)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
bool isLegalMaskedGatherScatter(Type *DataType) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getIntImmCost(int64_t Val)
Calculate the cost of materializing a 64-bit value.
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info)
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src)
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind)
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
EVT getPromotedVTForPredicate(EVT VT) const
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL, bool UseScalable) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const
Returns true if VecTy is a legal interleaved access type.
Class for arbitrary precision integers.
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
unsigned popcount() const
Count the number of bits set.
unsigned countLeadingOnes() const
void negate()
Negate this APInt in place.
APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
unsigned logBase2() const
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
int64_t getSExtValue() const
Get sign extended value.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
LLVM Basic Block Representation.
bool isTypeLegal(Type *Ty)
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false)
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind)
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr)
InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind)
Estimate the overhead of scalarizing an instruction.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind)
Compute a cost of the given call instruction.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=std::nullopt, const Instruction *CxtI=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0)
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name, BasicBlock::iterator InsertBefore)
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
bool isStrictFP() const
Determine if the call requires strict floating point semantics.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
@ FCMP_OLT
0 1 0 0 True if ordered and less than
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
bool isIntPredicate() const
static ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
const APInt & getValue() const
Return the constant as an APInt value reference.
This is an important base class in LLVM.
static Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
static constexpr ElementCount getScalable(ScalarTy MinVal)
static constexpr ElementCount getFixed(ScalarTy MinVal)
Convenience struct for specifying and reasoning about fast-math flags.
bool allowContract() const
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
bool isEquality() const
Return true if this predicate is either EQ or NE.
Value * CreateVScale(Constant *Scaling, const Twine &Name="")
Create a call to llvm.vscale, multiplied by Scaling.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateInsertValue(Value *Agg, Value *Val, ArrayRef< unsigned > Idxs, const Twine &Name="")
CallInst * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, Instruction *FMFSource=nullptr, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", bool IsInBounds=false)
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
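A minimal sketch tying several of the IRBuilder calls above together; BB, Ptr, Mask, and OnFalse are assumed to be supplied by the caller, and the element type and alignment are purely illustrative:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  static Value *emitMaskedLoadThenSelect(BasicBlock *BB, Value *Ptr,
                                         Value *Mask, Value *OnFalse) {
    IRBuilder<> Builder(BB->getContext());
    Builder.SetInsertPoint(BB); // append to the end of BB
    Type *VecTy = FixedVectorType::get(Builder.getInt32Ty(), 4);
    // Disabled lanes read the (poison) passthru, so select a fallback instead.
    Value *Loaded = Builder.CreateMaskedLoad(VecTy, Ptr, Align(4), Mask);
    return Builder.CreateSelect(Mask, Loaded, OnFalse, "masked.sel");
  }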
static InsertElementInst * Create(Value *Vec, Value *NewElt, Value *Idx, const Twine &NameStr, BasicBlock::iterator InsertBefore)
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
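A hedged sketch of the combiner-aware idiom these two hooks support: a visitor that proves an instruction redundant replaces its uses with a simpler value, letting the combiner's worklist see both changes (the "computes its first operand" premise is purely illustrative):

  #include "llvm/Transforms/InstCombine/InstCombiner.h"
  using namespace llvm;

  static Instruction *foldToFirstOperand(InstCombiner &IC, Instruction &I) {
    // Assume some prior analysis proved I always computes its first operand.
    return IC.replaceInstUsesWith(I, I.getOperand(0));
  }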
static InstructionCost getInvalid(CostType Val=0)
const Module * getModule() const
Return the module owning the function this instruction belongs to, or nullptr if the function does not belong to a module.
FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports these flags.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
Type * getReturnType() const
const SmallVectorImpl< const Value * > & getArgs() const
Intrinsic::ID getID() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
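Sketch of the usual dispatch on an intrinsic call through this wrapper (llvm.vscale is chosen only as an example):

  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  static bool isVScaleCall(const Value *V) {
    if (const auto *II = dyn_cast<IntrinsicInst>(V))
      return II->getIntrinsicID() == Intrinsic::vscale;
    return false;
  }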
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address space zero).
static PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of existing predicates.
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool requiresSMChange(const SMEAttrs &Callee) const
void set(unsigned M, bool Enable=true)
bool hasStreamingBody() const
static ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
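A minimal sketch combining getSCEV with the SCEVAddRecExpr queries above: classify V as an affine induction (A + B*x) of loop L, with SE and L supplied by the caller:

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  using namespace llvm;

  static bool isAffineAddRecIn(ScalarEvolution &SE, Value *V, const Loop *L) {
    const SCEV *S = SE.getSCEV(V);
    const auto *AR = dyn_cast<SCEVAddRecExpr>(S);
    return AR && AR->getLoop() == L && AR->isAffine();
  }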
static bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>.
static bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
iterator insert(iterator I, T &&Elt)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
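Sketch of the StringSwitch idiom: chained .Case calls with a .Default fallback (the suffix-to-lane mapping here is illustrative, not taken from this file):

  #include "llvm/ADT/StringSwitch.h"
  using namespace llvm;

  static unsigned lanesPer128BitBlock(StringRef Suffix) {
    return StringSwitch<unsigned>(Suffix)
        .Case("b", 16)
        .Case("h", 8)
        .Case("s", 4)
        .Case("d", 2)
        .Default(0);
  }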
Class to represent struct types.
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
unsigned getMaxExpandSizeMemcmp(bool OptSize) const
Get maximum # of load operations permitted for memcmp.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lowering.
LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const
Return pair that represents the legalization kind (first) that needs to happen to EVT (second) in order to type-legalize it.
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal'), we need to promote it to a larger type (return 'Promote'), or we need to expand it into multiple registers of a smaller type (return 'Expand').
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
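A hedged sketch of these legalization queries: step a type through getTypeConversion until getTypeAction reports it legal, counting the steps (assumes VT is a type the target can eventually legalize):

  #include "llvm/CodeGen/TargetLowering.h"
  using namespace llvm;

  static unsigned legalizationSteps(const TargetLoweringBase &TLI,
                                    LLVMContext &Ctx, EVT VT) {
    unsigned Steps = 0;
    while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal) {
      VT = TLI.getTypeConversion(Ctx, VT).second; // next type in the chain
      ++Steps;
    }
    return Steps;
  }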
Primary interface to the complete machine description for the target machine.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
The instances of the Type class are immutable: once they are created, they are never changed.
bool isVectorTy() const
True if this is an instance of VectorType.
static IntegerType * getInt1Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
bool isFP128Ty() const
Return true if this is 'fp128'.
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
bool isPtrOrPtrVectorTy() const
Return true if this is a pointer type or a vector of pointer types.
bool isScalableTy() const
Return true if this is a type whose size is a known multiple of vscale.
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
bool isIntegerTy() const
True if this is an instance of IntegerType.
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
user_iterator user_begin()
bool hasOneUse() const
Return true if there is exactly one use of this value.
Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
LLVMContext & getContext() const
All values hold a context through their type.
void takeName(Value *V)
Transfer the name from V to this value.
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector.
static VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
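Sketch: the two ElementCount factories select between fixed-width and scalable vectors when constructing a VectorType.

  #include "llvm/IR/DerivedTypes.h"
  using namespace llvm;

  static VectorType *makeVectorTy(Type *EltTy, unsigned MinElts, bool Scalable) {
    ElementCount EC = Scalable ? ElementCount::getScalable(MinElts)
                               : ElementCount::getFixed(MinElts);
    return VectorType::get(EltTy, EC); // <MinElts x Elt> or <vscale x MinElts x Elt>
  }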
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the given register size.
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to synthesize the immediate.
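A sketch of how a cost model can use expandMOVImm (assuming the in-tree lib/Target/AArch64/AArch64ExpandImm.h header is reachable): the number of ImmInsnModel entries appended is the number of real move-immediate instructions needed:

  #include "AArch64ExpandImm.h"
  #include "llvm/ADT/SmallVector.h"
  using namespace llvm;

  static unsigned movImmInsnCount(uint64_t Imm, unsigned BitSize) {
    SmallVector<AArch64_IMM::ImmInsnModel, 4> Insn;
    AArch64_IMM::expandMOVImm(Imm, BitSize, Insn);
    return Insn.size(); // one MOVZ/MOVN/ORR/MOVK per entry
  }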
static constexpr unsigned SVEBitsPerBlock
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ ADD
Simple integer binary arithmetic operators.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter) to floating point.
@ FADD
Simple binary floating point operators.
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to memory with one type and loaded from the same address with the other type.
@ SIGN_EXTEND
Conversion operators.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of type iN containing the high bits of the result.
@ SHL
Shift and rotation operations.
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the destination VT.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
class_match< BinaryOperator > m_BinOp()
Match an arbitrary binary operation and ignore it.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
cst_pred_ty< is_zero_int > m_ZeroInt()
Match an integer 0 or a vector with all elements equal to 0.
OneUse_match< T > m_OneUse(const T &SubPattern)
class_match< CmpInst > m_Cmp()
Matches any compare instruction and ignore it.
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
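A minimal sketch of the matcher combinators above: recognise a single-use select(cmp, x, 0) and capture x.

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  static bool matchSelectOfZero(Value *V, Value *&X) {
    return match(V, m_OneUse(m_Select(m_Cmp(), m_Value(X), m_ZeroInt())));
  }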
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
bool isZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> or <4, 12, 5, 13, 6, 14, 7, 15>.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
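A hedged sketch of the cost-table lookup idiom (the table contents are illustrative, not real AArch64 costs):

  #include "llvm/CodeGen/CostTable.h"
  #include "llvm/CodeGen/ISDOpcodes.h"
  #include "llvm/Support/InstructionCost.h"
  using namespace llvm;

  static InstructionCost lookupAddCost(MVT Ty) {
    static const CostTblEntry Tbl[] = {
        {ISD::ADD, MVT::v4i32, 1}, // illustrative entries only
        {ISD::ADD, MVT::v2i64, 1},
    };
    if (const auto *Entry = CostTableLookup(Tbl, ISD::ADD, Ty))
      return Entry->Cost;
    return InstructionCost::getInvalid();
  }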
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: none; Reductions: loops containing reductions; Recurrences: loops containing fixed-order recurrences; Reverse: loops requiring reversed predicates; Simple: simple loops without reductions or recurrences; All: all legal loop types.
bool isUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1, 3, 5, 7, 9, 11, 13, 15>.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned element.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
void report_fatal_error(Error Err, bool gen_crash_diag=true)
Report a serious error, calling any installed error handler.
std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool Assume=false, bool ShouldCheckWrap=true)
If the pointer has a constant stride return it in units of the access type size.
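Sketch of the stride query: test whether Ptr advances by exactly one AccessTy element per iteration of Lp (PSE supplied by the caller):

  #include "llvm/Analysis/LoopAccessAnalysis.h"
  #include <optional>
  using namespace llvm;

  static bool isUnitStride(PredicatedScalarEvolution &PSE, Type *AccessTy,
                           Value *Ptr, const Loop *Lp) {
    std::optional<int64_t> Stride = getPtrStride(PSE, AccessTy, Ptr, Lp);
    return Stride && *Stride == 1; // stride is in units of the access size
  }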
static unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
constexpr int PoisonMaskElem
raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
@ Mod
The access may modify the value stored in memory.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ FAnyOf
Any_of reduction with select(fcmp(),x,y) where one of (x,y) is loop invariant, and both x and y are of integer type.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Xor
Bitwise or logical XOR of integers.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ IAnyOf
Any_of reduction with select(icmp(),x,y) where one of (x,y) is loop invariant, and both x and y are of integer type.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
@ Default
The result values are uniform if and only if all operands are uniform.
Type * getLoadStoreType(Value *I)
A helper function that returns the type of a load or store instruction.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
This struct is a compact representation of a valid (non-zero power of two) alignment.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
uint64_t getScalarSizeInBits() const
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
bool isFixedLengthVector() const
Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
Type Conversion Cost Table.