29#include "llvm/IR/IntrinsicsAMDGPU.h"
40#define DEBUG_TYPE "amdgpu-codegenprepare"
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
54 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
55 cl::desc(
"Break large PHI nodes for DAGISel"),
59 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc(
"For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
70 "amdgpu-codegenprepare-mul24",
71 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
97class AMDGPUCodeGenPrepareImpl
98 :
public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged =
false;
109 mutable Function *SqrtF32 =
nullptr;
110 mutable Function *LdexpF32 =
nullptr;
119 DL(
F.getDataLayout()), SQ(
DL, TLI, DT, AC),
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
144 bool canBreakPHINode(
const PHINode &
I);
147 bool isLegalFloatingTy(
const Type *
T)
const;
156 bool canIgnoreDenormalInput(
const Value *V,
const Instruction *CtxI)
const {
157 return HasFP32DenormalFlush ||
182 unsigned MaxDivBits,
bool Signed)
const;
187 bool IsDiv,
bool IsSigned)
const;
191 bool IsDiv,
bool IsSigned)
const;
209 bool canWidenScalarExtLoad(
LoadInst &
I)
const;
224 float ReqdAccuracy)
const;
229 float ReqdAccuracy)
const;
231 std::pair<Value *, Value *> getFrexpResults(
IRBuilder<> &Builder,
235 bool IsNegative)
const;
242 bool IsNegative)
const;
246 void replaceWithMaskedWorkitemIdX(
Instruction &
I,
unsigned WaveSize)
const;
247 bool tryReplaceWithWorkitemId(
Instruction &
I,
unsigned Wave)
const;
280 if (!ExpandDiv64InIR)
284 StringRef getPassName()
const override {
return "AMDGPU IR optimizations"; }
289bool AMDGPUCodeGenPrepareImpl::run() {
290 BreakPhiNodesCache.clear();
291 bool MadeChange =
false;
303 while (!DeadVals.empty()) {
311bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(
const Type *Ty)
const {
313 (Ty->
isHalfTy() && ST.has16BitInsts());
316bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &
I)
const {
317 Type *Ty =
I.getType();
318 int TySize =
DL.getTypeSizeInBits(Ty);
319 Align Alignment =
DL.getValueOrABITypeAlignment(
I.getAlign(), Ty);
321 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.
isUniform(&
I);
325AMDGPUCodeGenPrepareImpl::numBitsUnsigned(
Value *
Op,
326 const Instruction *CtxI)
const {
331AMDGPUCodeGenPrepareImpl::numBitsSigned(
Value *
Op,
332 const Instruction *CtxI)
const {
344 for (
int I = 0,
E = VT->getNumElements();
I !=
E; ++
I)
345 Values.
push_back(Builder.CreateExtractElement(V,
I));
351 if (!Ty->isVectorTy()) {
357 for (
int I = 0,
E = Values.
size();
I !=
E; ++
I)
358 NewVal = Builder.CreateInsertElement(NewVal, Values[
I],
I);
363bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &
I)
const {
364 if (
I.getOpcode() != Instruction::Mul)
367 Type *Ty =
I.getType();
369 if (
Size <= 16 && ST.has16BitInsts())
379 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
381 unsigned LHSBits = 0, RHSBits = 0;
382 bool IsSigned =
false;
384 if (ST.
hasMulU24() && (LHSBits = numBitsUnsigned(
LHS, &
I)) <= 24 &&
385 (RHSBits = numBitsUnsigned(
RHS, &
I)) <= 24) {
388 }
else if (ST.
hasMulI24() && (LHSBits = numBitsSigned(
LHS, &
I)) <= 24 &&
389 (RHSBits = numBitsSigned(
RHS, &
I)) <= 24) {
395 SmallVector<Value *, 4> LHSVals;
396 SmallVector<Value *, 4> RHSVals;
397 SmallVector<Value *, 4> ResultVals;
401 IntegerType *I32Ty = Builder.getInt32Ty();
402 IntegerType *IntrinTy =
Size > 32 ? Builder.getInt64Ty() : I32Ty;
403 Type *DstTy = LHSVals[0]->getType();
405 for (
int I = 0,
E = LHSVals.
size();
I !=
E; ++
I) {
406 Value *
LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[
I], I32Ty)
407 : Builder.CreateZExtOrTrunc(LHSVals[
I], I32Ty);
408 Value *
RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[
I], I32Ty)
409 : Builder.CreateZExtOrTrunc(RHSVals[
I], I32Ty);
411 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
413 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
414 : Builder.CreateZExtOrTrunc(Result, DstTy);
420 I.replaceAllUsesWith(NewVal);
421 DeadVals.push_back(&
I);
441bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO)
const {
462 if (!CBO || !CT || !CF)
489 Builder.setFastMathFlags(FPOp->getFastMathFlags());
495 DeadVals.push_back(&BO);
497 DeadVals.push_back(CastOp);
498 DeadVals.push_back(Sel);
502std::pair<Value *, Value *>
503AMDGPUCodeGenPrepareImpl::getFrexpResults(
IRBuilder<> &Builder,
505 Type *Ty = Src->getType();
518 : Builder.CreateExtractValue(Frexp, {1});
519 return {FrexpMant, FrexpExp};
525 bool IsNegative)
const {
540 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
543 return Builder.
CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
549 FastMathFlags FMF)
const {
553 if (HasFP32DenormalFlush && ST.
hasFractBug() && !ST.hasFastFMAF32() &&
559 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder,
RHS);
564 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder,
LHS);
576 FastMathFlags FMF)
const {
577 Type *Ty = Src->getType();
581 Builder.
CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
584 Value *InputScaleFactor =
591 Value *OutputScaleFactor =
593 return Builder.
CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
604 Type *Ty = Src->getType();
608 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
609 Constant *One = ConstantFP::get(Ty, 1.0);
610 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
612 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
614 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
616 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
617 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
618 Value *OutputScaleFactor = Builder.CreateSelect(
619 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
621 return Builder.CreateFMul(Rsq, OutputScaleFactor);
627 FastMathFlags SqrtFMF,
628 FastMathFlags DivFMF,
629 const Instruction *CtxI,
630 bool IsNegative)
const {
652 bool MaybePosInf = !SqrtFMF.
noInfs() && !DivFMF.
noInfs();
653 bool MaybeZero = !DivFMF.
noInfs();
655 DenormalMode DenormMode;
662 if (Interested !=
fcNone) {
667 DenormMode =
F.getDenormalMode(
X->getType()->getFltSemantics());
673 if (MaybeZero || MaybePosInf) {
675 if (MaybePosInf && MaybeZero) {
676 if (DenormMode.
Input != DenormalMode::DenormalModeKind::Dynamic) {
691 }
else if (MaybeZero) {
704 Value *
E = Builder.
CreateFMA(NegXY0, Y0, ConstantFP::get(
X->getType(), 1.0));
709 ConstantFP::get(
X->getType(), 0.5));
711 return Builder.
CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
714bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
715 FastMathFlags SqrtFMF)
const {
721Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
723 const FastMathFlags SqrtFMF,
const Instruction *CtxI)
const {
734 bool IsNegative =
false;
739 IRBuilder<>::FastMathFlagGuard Guard(Builder);
744 canIgnoreDenormalInput(Den, CtxI)) {
755 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
769 Value *Den, FastMathFlags FMF,
770 const Instruction *CtxI)
const {
777 bool IsNegative =
false;
782 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
803 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
812 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
817 Value *Recip = emitRcpIEEE1ULP(Builder, Den,
false);
831Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
834 if (ReqdAccuracy < 2.5f)
840 bool NumIsOne =
false;
842 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
850 if (!HasFP32DenormalFlush && !NumIsOne)
853 return Builder.
CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
856Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
858 FastMathFlags SqrtFMF,
Value *RsqOp,
const Instruction *FDivInst,
859 float ReqdDivAccuracy)
const {
862 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
870 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
878 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
882 return emitFrexpDiv(Builder, Num, Den, DivFMF);
900bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
901 if (DisableFDivExpand)
916 FastMathFlags SqrtFMF;
921 Value *RsqOp =
nullptr;
923 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
924 DenII->hasOneUse()) {
926 SqrtFMF = SqrtOp->getFastMathFlags();
927 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
928 RsqOp = SqrtOp->getOperand(0);
932 if (!IsFloat && !RsqOp)
944 const bool AllowInaccurateRcp = DivFMF.
approxFunc();
945 if (!RsqOp && AllowInaccurateRcp)
949 if (IsFloat && ReqdAccuracy < 1.0f)
956 SmallVector<Value *, 4> NumVals;
957 SmallVector<Value *, 4> DenVals;
958 SmallVector<Value *, 4> RsqDenVals;
965 SmallVector<Value *, 4> ResultVals(NumVals.
size());
966 for (
int I = 0,
E = NumVals.
size();
I !=
E; ++
I) {
967 Value *NumElt = NumVals[
I];
968 Value *DenElt = DenVals[
I];
969 Value *RsqDenElt = RsqOp ? RsqDenVals[
I] :
nullptr;
972 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
981 NewEltInst->copyMetadata(FDiv);
984 ResultVals[
I] = NewElt;
992 DeadVals.push_back(&FDiv);
1003 Value *LHS_EXT64 = Builder.CreateZExt(
LHS, I64Ty);
1004 Value *RHS_EXT64 = Builder.CreateZExt(
RHS, I64Ty);
1005 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1006 Value *
Lo = Builder.CreateTrunc(MUL64, I32Ty);
1007 Value *
Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1008 Hi = Builder.CreateTrunc(
Hi, I32Ty);
1009 return std::pair(
Lo,
Hi);
1020unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &
I,
Value *Num,
1022 unsigned MaxDivBits,
1023 bool IsSigned)
const {
1030 unsigned DivBits = SSBits - RHSSignBits + 1;
1031 if (DivBits > MaxDivBits)
1036 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1037 DivBits = SSBits - SignBits + 1;
1047 unsigned DivBits = SSBits - RHSSignBits;
1048 if (DivBits > MaxDivBits)
1056 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1057 DivBits = SSBits - SignBits;
1064 BinaryOperator &
I,
Value *Num,
1065 Value *Den,
bool IsDiv,
1066 bool IsSigned)
const {
1067 unsigned DivBits = getDivNumBits(
I, Num, Den, 24, IsSigned);
1070 return expandDivRem24Impl(Builder,
I, Num, Den, DivBits, IsDiv, IsSigned);
1073Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1075 unsigned DivBits,
bool IsDiv,
bool IsSigned)
const {
1081 ConstantInt *One = Builder.
getInt32(1);
1121 auto FMAD = !ST.hasMadMacF32Insts()
1125 {FQNeg->
getType()}, {FQNeg, FB, FA}, FQ);
1153 if (DivBits != 0 && DivBits < 32) {
1156 int InRegBits = 32 - DivBits;
1158 Res = Builder.
CreateShl(Res, InRegBits);
1161 ConstantInt *TruncMask
1162 = Builder.
getInt32((UINT64_C(1) << DivBits) - 1);
1163 Res = Builder.
CreateAnd(Res, TruncMask);
1174bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &
I,
1180 if (
C->getType()->getScalarSizeInBits() <= 32)
1196 if (BinOpDen->getOpcode() == Instruction::Shl &&
1214 return Builder.CreateAShr(V, Builder.getInt32(31));
1221 assert(
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1222 Opc == Instruction::SRem ||
Opc == Instruction::SDiv);
1228 if (divHasSpecialOptimization(
I,
X,
Y))
1231 bool IsDiv =
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv;
1232 bool IsSigned =
Opc == Instruction::SRem ||
Opc == Instruction::SDiv;
1234 Type *Ty =
X->getType();
1248 if (
Value *Res = expandDivRem24(Builder,
I,
X,
Y, IsDiv, IsSigned)) {
1254 ConstantInt *One = Builder.
getInt32(1);
1256 Value *Sign =
nullptr;
1261 Sign = IsDiv ? Builder.
CreateXor(SignX, SignY) : SignX;
1342 BinaryOperator &
I,
Value *Num,
1344 if (!ExpandDiv64InIR && divHasSpecialOptimization(
I, Num, Den))
1349 bool IsDiv =
Opc == Instruction::SDiv ||
Opc == Instruction::UDiv;
1350 bool IsSigned =
Opc == Instruction::SDiv ||
Opc == Instruction::SRem;
1352 unsigned NumDivBits = getDivNumBits(
I, Num, Den, 32, IsSigned);
1353 if (NumDivBits > 32)
1356 Value *Narrowed =
nullptr;
1357 if (NumDivBits <= 24) {
1358 Narrowed = expandDivRem24Impl(Builder,
I, Num, Den, NumDivBits,
1360 }
else if (NumDivBits <= 32) {
1361 Narrowed = expandDivRem32(Builder,
I, Num, Den);
1372void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &
I)
const {
1375 if (
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv) {
1380 if (
Opc == Instruction::URem ||
Opc == Instruction::SRem) {
1400bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *
I) {
1401 unsigned Opc =
I->getOpcode();
1402 Type *OldType =
I->getType();
1404 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1409 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1411 "Instruction::Mul.");
1415 MaxBitsNeeded = std::max<unsigned>(
bit_ceil(MaxBitsNeeded), 8);
1416 Type *NewType =
DL.getSmallestLegalIntType(
I->getContext(), MaxBitsNeeded);
1420 if (NewBit >= OrigBit)
1432 int NumOfNonConstOps = 2;
1435 NumOfNonConstOps = 1;
1445 if (NewCost >= OldCost)
1456 DeadVals.push_back(
I);
1460bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &
I) {
1461 if (foldBinOpIntoSelect(
I))
1464 if (UseMul24Intrin && replaceMulWithMul24(
I))
1466 if (tryNarrowMathIfNoOverflow(&
I))
1471 Type *Ty =
I.getType();
1472 Value *NewDiv =
nullptr;
1477 if ((
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1478 Opc == Instruction::SRem ||
Opc == Instruction::SDiv) &&
1480 !DisableIDivExpand) {
1481 Value *Num =
I.getOperand(0);
1482 Value *Den =
I.getOperand(1);
1489 for (
unsigned N = 0,
E = VT->getNumElements();
N !=
E; ++
N) {
1494 if (ScalarSize <= 32) {
1495 NewElt = expandDivRem32(Builder,
I, NumEltN, DenEltN);
1501 NewElt = shrinkDivRem64(Builder,
I, NumEltN, DenEltN);
1515 NewEltI->copyIRFlags(&
I);
1520 if (ScalarSize <= 32)
1521 NewDiv = expandDivRem32(Builder,
I, Num, Den);
1523 NewDiv = shrinkDivRem64(Builder,
I, Num, Den);
1530 I.replaceAllUsesWith(NewDiv);
1531 DeadVals.push_back(&
I);
1536 if (ExpandDiv64InIR) {
1538 for (BinaryOperator *Div : Div64ToExpand) {
1539 expandDivRem64(*Div);
1548bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &
I) {
1554 canWidenScalarExtLoad(
I)) {
1564 if (
auto *
Range =
WidenLoad->getMetadata(LLVMContext::MD_range)) {
1565 ConstantInt *
Lower =
1568 if (
Lower->isNullValue()) {
1569 WidenLoad->setMetadata(LLVMContext::MD_range,
nullptr);
1577 WidenLoad->setMetadata(LLVMContext::MD_range,
1582 int TySize =
DL.getTypeSizeInBits(
I.getType());
1587 DeadVals.push_back(&
I);
1594bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &
I) {
1600 Value *Fract =
nullptr;
1609 Value *FractSrc = matchFractPatImpl(*
X, *
C);
1614 Fract = applyFractPat(Builder, FractSrc);
1624 CmpPredicate IsNanPred;
1633 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1634 CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
1636 Fract = applyFractPat(Builder, CmpVal);
1637 }
else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1638 if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
1640 Fract = applyFractPat(Builder, CmpVal);
1644 CmpPredicate PredInf;
1650 PredInf != FCmpInst::FCMP_UNE ||
1651 CmpVal != matchFractPatNanAvoidant(*IfNotInf))
1661 Value *NewFract = applyFractPat(Builder, CmpVal);
1665 DeadVals.push_back(ClampInfSelect->
getOperand(1));
1669 Fract = ClampInfSelect;
1676 I.replaceAllUsesWith(Fract);
1677 DeadVals.push_back(&
I);
1684 return IA && IB && IA->getParent() == IB->getParent();
1694 const Value *CurVal = V;
1697 BitVector EltsCovered(FVT->getNumElements());
1704 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1707 const auto *VecSrc = IE->getOperand(0);
1716 EltsCovered.
set(Idx->getZExtValue());
1719 if (EltsCovered.
all())
1746 const auto [It, Inserted] = SeenPHIs.
insert(&
I);
1750 for (
const Value *Inc :
I.incoming_values()) {
1755 for (
const User *U :
I.users()) {
1761bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(
const PHINode &
I) {
1763 if (
const auto It = BreakPhiNodesCache.find(&
I);
1764 It != BreakPhiNodesCache.end())
1773 SmallPtrSet<const PHINode *, 8> WorkList;
1779 for (
const PHINode *WLP : WorkList) {
1780 assert(BreakPhiNodesCache.count(WLP) == 0);
1795 const auto Threshold = (
alignTo(WorkList.size() * 2, 3) / 3);
1796 unsigned NumBreakablePHIs = 0;
1797 bool CanBreak =
false;
1798 for (
const PHINode *Cur : WorkList) {
1806 if (++NumBreakablePHIs >= Threshold) {
1813 for (
const PHINode *Cur : WorkList)
1814 BreakPhiNodesCache[Cur] = CanBreak;
1863 Value *&Res = SlicedVals[{BB, Inc}];
1869 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1875 Res =
B.CreateShuffleVector(Inc, Mask, NewValName);
1877 Res =
B.CreateExtractElement(Inc,
Idx, NewValName);
1886bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &
I) {
1902 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1905 if (!ForceBreakLargePHIs && !canBreakPHINode(
I))
1908 std::vector<VectorSlice> Slices;
1915 const unsigned EltSize =
DL.getTypeSizeInBits(EltTy);
1917 if (EltSize == 8 || EltSize == 16) {
1918 const unsigned SubVecSize = (32 / EltSize);
1920 for (
unsigned End =
alignDown(NumElts, SubVecSize); Idx < End;
1922 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1926 for (; Idx < NumElts; ++Idx)
1927 Slices.emplace_back(EltTy, Idx, 1);
1930 assert(Slices.size() > 1);
1936 B.SetCurrentDebugLocation(
I.getDebugLoc());
1938 unsigned IncNameSuffix = 0;
1939 for (VectorSlice &S : Slices) {
1942 B.SetInsertPoint(
I.getParent()->getFirstNonPHIIt());
1943 S.NewPHI =
B.CreatePHI(S.Ty,
I.getNumIncomingValues());
1945 for (
const auto &[Idx, BB] :
enumerate(
I.blocks())) {
1946 S.NewPHI->addIncoming(S.getSlicedVal(BB,
I.getIncomingValue(Idx),
1947 "largephi.extractslice" +
1948 std::to_string(IncNameSuffix++)),
1955 unsigned NameSuffix = 0;
1956 for (VectorSlice &S : Slices) {
1957 const auto ValName =
"largephi.insertslice" + std::to_string(NameSuffix++);
1959 Vec =
B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1961 Vec =
B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1964 I.replaceAllUsesWith(Vec);
1965 DeadVals.push_back(&
I);
1988 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
2007 assert(SrcPtrKB.getBitWidth() ==
DL.getPointerSizeInBits(AS));
2008 assert((NullVal == 0 || NullVal == -1) &&
2009 "don't know how to check for this null value!");
2010 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2013bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &
I) {
2017 if (
I.getType()->isVectorTy())
2022 const unsigned SrcAS =
I.getSrcAddressSpace();
2023 const unsigned DstAS =
I.getDestAddressSpace();
2025 bool CanLower =
false;
2043 auto *Intrin =
B.CreateIntrinsic(
2044 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2045 I.replaceAllUsesWith(Intrin);
2046 DeadVals.push_back(&
I);
2050bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &
I) {
2053 case Intrinsic::minnum:
2054 case Intrinsic::minimumnum:
2055 case Intrinsic::minimum:
2056 return visitFMinLike(
I);
2057 case Intrinsic::sqrt:
2058 return visitSqrt(
I);
2059 case Intrinsic::log:
2060 case Intrinsic::log10:
2062 case Intrinsic::log2:
2065 case Intrinsic::amdgcn_mbcnt_lo:
2066 return visitMbcntLo(
I);
2067 case Intrinsic::amdgcn_mbcnt_hi:
2068 return visitMbcntHi(
I);
2076Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(
Value &FractSrc,
2077 const APFloat &
C)
const {
2086 OneNextDown.
next(
true);
2089 if (OneNextDown !=
C)
2109Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(
Value &V) {
2121 return matchFractPatImpl(*Arg0, *
C);
2126 SmallVector<Value *, 4> FractVals;
2129 SmallVector<Value *, 4> ResultVals(FractVals.
size());
2132 for (
unsigned I = 0,
E = FractVals.
size();
I !=
E; ++
I) {
2140bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &
I) {
2148 FractArg = matchFractPatImpl(*
X, *
C);
2153 FractArg = matchFractPatNanAvoidant(
I);
2165 FastMathFlags FMF =
I.getFastMathFlags();
2169 Value *Fract = applyFractPat(Builder, FractArg);
2171 I.replaceAllUsesWith(Fract);
2172 DeadVals.push_back(&
I);
2177bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2193 if (ReqdAccuracy < 1.0f)
2197 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2201 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2205 SmallVector<Value *, 4> SrcVals;
2208 SmallVector<Value *, 4> ResultVals(SrcVals.
size());
2209 for (
int I = 0,
E = SrcVals.
size();
I !=
E; ++
I) {
2211 ResultVals[
I] = Builder.
CreateCall(getSqrtF32(), SrcVals[
I]);
2213 ResultVals[
I] = emitSqrtIEEE2ULP(Builder, SrcVals[
I], SqrtFMF);
2219 DeadVals.push_back(&Sqrt);
2224bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2230 FastMathFlags FMF =
Log.getFastMathFlags();
2237 if (
Log.getFPAccuracy() < 1.80f)
2248 double Log2BaseInverted =
2255 Log.replaceAllUsesWith(
Mul);
2256 DeadVals.push_back(&Log);
2260bool AMDGPUCodeGenPrepare::runOnFunction(Function &
F) {
2261 if (skipFunction(
F))
2264 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2268 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2269 const TargetLibraryInfo *TLI =
2270 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
F);
2271 AssumptionCache *AC =
2272 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
2273 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2274 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() :
nullptr;
2276 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2277 return AMDGPUCodeGenPrepareImpl(
F, TM, TLI, AC, DT, UA).run();
2287 AMDGPUCodeGenPrepareImpl Impl(
F, ATM, TLI, AC, DT, UA);
2291 if (!Impl.FlowChanged)
2297 "AMDGPU IR optimizations",
false,
false)
2306 CallInst *Tid =
B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2307 ST.makeLIDRangeMetadata(Tid);
2312void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &
I)
const {
2314 CallInst *Tid = createWorkitemIdX(
B);
2320void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2321 Instruction &
I,
unsigned WaveSize)
const {
2323 CallInst *Tid = createWorkitemIdX(
B);
2325 Value *AndInst =
B.CreateAnd(Tid, Mask);
2333bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &
I,
2334 unsigned Wave)
const {
2341 if (*MaybeX == Wave) {
2342 replaceWithWorkitemIdX(
I);
2349 replaceWithMaskedWorkitemIdX(
I, Wave);
2357bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &
I)
const {
2373bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &
I)
const {
2386 if (*MaybeX == Wave) {
2397 using namespace PatternMatch;
2405 return tryReplaceWithWorkitemId(
I, Wave);
2408char AMDGPUCodeGenPrepare::ID = 0;
2411 return new AMDGPUCodeGenPrepare();
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
opStatus next(bool nextDown)
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
BinaryOps getOpcode() const
bool all() const
all - Returns true if all bits are set.
Represents analyses that only rely on functions' control flow.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Utility class for floating point operations which can have information about relaxed accuracy require...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
void setFast(bool B=true)
bool allowReciprocal() const
void setNoNaNs(bool B=true)
bool allowContract() const
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Base class for instruction visitors.
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
LLVM_ABI const fltSemantics & getFltSemantics() const
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index, and B, C, ..., are the values from the original input ranges.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the original instruction.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
auto reverse(ContainerTy &&C)
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector&lt;T, 0&gt;).
bool isa(const From &Val)
isa&lt;X&gt; - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has no NaN elements.
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions and return multiple objects.
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
bool isNegative() const
Returns true if this value is known to be negative.
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
SimplifyQuery getWithInstruction(const Instruction *I) const