29#include "llvm/IR/IntrinsicsAMDGPU.h"
40#define DEBUG_TYPE "amdgpu-codegenprepare"
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
54 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
55 cl::desc(
"Break large PHI nodes for DAGISel"),
59 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc(
"For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
70 "amdgpu-codegenprepare-mul24",
71 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
97class AMDGPUCodeGenPrepareImpl
98 :
public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged =
false;
109 mutable Function *SqrtF32 =
nullptr;
110 mutable Function *LdexpF32 =
nullptr;
119 DL(
F.getDataLayout()), SQ(
DL, TLI, DT, AC),
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
144 bool canBreakPHINode(
const PHINode &
I);
147 bool isLegalFloatingTy(
const Type *
T)
const;
156 bool canIgnoreDenormalInput(
const Value *V,
const Instruction *CtxI)
const {
157 return HasFP32DenormalFlush ||
182 unsigned MaxDivBits,
bool Signed)
const;
187 bool IsDiv,
bool IsSigned)
const;
191 bool IsDiv,
bool IsSigned)
const;
209 bool canWidenScalarExtLoad(
LoadInst &
I)
const;
223 float ReqdAccuracy)
const;
228 float ReqdAccuracy)
const;
230 std::pair<Value *, Value *> getFrexpResults(
IRBuilder<> &Builder,
234 bool IsNegative)
const;
241 bool IsNegative)
const;
245 void replaceWithMaskedWorkitemIdX(
Instruction &
I,
unsigned WaveSize)
const;
246 bool tryReplaceWithWorkitemId(
Instruction &
I,
unsigned Wave)
const;
279 if (!ExpandDiv64InIR)
283 StringRef getPassName()
const override {
return "AMDGPU IR optimizations"; }
288bool AMDGPUCodeGenPrepareImpl::run() {
289 BreakPhiNodesCache.clear();
290 bool MadeChange =
false;
302 while (!DeadVals.empty()) {
310bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(
const Type *Ty)
const {
312 (Ty->
isHalfTy() && ST.has16BitInsts());
315bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &
I)
const {
316 Type *Ty =
I.getType();
317 int TySize =
DL.getTypeSizeInBits(Ty);
318 Align Alignment =
DL.getValueOrABITypeAlignment(
I.getAlign(), Ty);
320 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.
isUniform(&
I);
324AMDGPUCodeGenPrepareImpl::numBitsUnsigned(
Value *
Op,
325 const Instruction *CtxI)
const {
330AMDGPUCodeGenPrepareImpl::numBitsSigned(
Value *
Op,
331 const Instruction *CtxI)
const {
343 for (
int I = 0,
E = VT->getNumElements();
I !=
E; ++
I)
344 Values.
push_back(Builder.CreateExtractElement(V,
I));
350 if (!Ty->isVectorTy()) {
356 for (
int I = 0,
E = Values.
size();
I !=
E; ++
I)
357 NewVal = Builder.CreateInsertElement(NewVal, Values[
I],
I);
362bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &
I)
const {
363 if (
I.getOpcode() != Instruction::Mul)
366 Type *Ty =
I.getType();
368 if (
Size <= 16 && ST.has16BitInsts())
378 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
380 unsigned LHSBits = 0, RHSBits = 0;
381 bool IsSigned =
false;
383 if (ST.
hasMulU24() && (LHSBits = numBitsUnsigned(
LHS, &
I)) <= 24 &&
384 (RHSBits = numBitsUnsigned(
RHS, &
I)) <= 24) {
387 }
else if (ST.
hasMulI24() && (LHSBits = numBitsSigned(
LHS, &
I)) <= 24 &&
388 (RHSBits = numBitsSigned(
RHS, &
I)) <= 24) {
394 SmallVector<Value *, 4> LHSVals;
395 SmallVector<Value *, 4> RHSVals;
396 SmallVector<Value *, 4> ResultVals;
400 IntegerType *I32Ty = Builder.getInt32Ty();
401 IntegerType *IntrinTy =
Size > 32 ? Builder.getInt64Ty() : I32Ty;
402 Type *DstTy = LHSVals[0]->getType();
404 for (
int I = 0,
E = LHSVals.
size();
I !=
E; ++
I) {
405 Value *
LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[
I], I32Ty)
406 : Builder.CreateZExtOrTrunc(LHSVals[
I], I32Ty);
407 Value *
RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[
I], I32Ty)
408 : Builder.CreateZExtOrTrunc(RHSVals[
I], I32Ty);
410 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
412 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
413 : Builder.CreateZExtOrTrunc(Result, DstTy);
419 I.replaceAllUsesWith(NewVal);
420 DeadVals.push_back(&
I);
440bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO)
const {
461 if (!CBO || !CT || !CF)
488 Builder.setFastMathFlags(FPOp->getFastMathFlags());
494 DeadVals.push_back(&BO);
496 DeadVals.push_back(CastOp);
497 DeadVals.push_back(Sel);
501std::pair<Value *, Value *>
502AMDGPUCodeGenPrepareImpl::getFrexpResults(
IRBuilder<> &Builder,
504 Type *Ty = Src->getType();
517 : Builder.CreateExtractValue(Frexp, {1});
518 return {FrexpMant, FrexpExp};
524 bool IsNegative)
const {
539 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
542 return Builder.
CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
548 FastMathFlags FMF)
const {
552 if (HasFP32DenormalFlush && ST.
hasFractBug() && !ST.hasFastFMAF32() &&
558 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder,
RHS);
563 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder,
LHS);
575 FastMathFlags FMF)
const {
576 Type *Ty = Src->getType();
580 Builder.
CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
583 Value *InputScaleFactor =
590 Value *OutputScaleFactor =
592 return Builder.
CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
603 Type *Ty = Src->getType();
607 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
608 Constant *One = ConstantFP::get(Ty, 1.0);
609 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
611 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
613 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
615 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
616 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
617 Value *OutputScaleFactor = Builder.CreateSelect(
618 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
620 return Builder.CreateFMul(Rsq, OutputScaleFactor);
626 FastMathFlags SqrtFMF,
627 FastMathFlags DivFMF,
628 const Instruction *CtxI,
629 bool IsNegative)
const {
651 bool MaybePosInf = !SqrtFMF.
noInfs() && !DivFMF.
noInfs();
652 bool MaybeZero = !DivFMF.
noInfs();
654 DenormalMode DenormMode;
661 if (Interested !=
fcNone) {
666 DenormMode =
F.getDenormalMode(
X->getType()->getFltSemantics());
672 if (MaybeZero || MaybePosInf) {
674 if (MaybePosInf && MaybeZero) {
675 if (DenormMode.
Input != DenormalMode::DenormalModeKind::Dynamic) {
690 }
else if (MaybeZero) {
703 Value *
E = Builder.
CreateFMA(NegXY0, Y0, ConstantFP::get(
X->getType(), 1.0));
708 ConstantFP::get(
X->getType(), 0.5));
710 return Builder.
CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
713bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
714 FastMathFlags SqrtFMF)
const {
720Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
722 const FastMathFlags SqrtFMF,
const Instruction *CtxI)
const {
733 bool IsNegative =
false;
738 IRBuilder<>::FastMathFlagGuard Guard(Builder);
743 canIgnoreDenormalInput(Den, CtxI)) {
754 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
768 Value *Den, FastMathFlags FMF,
769 const Instruction *CtxI)
const {
776 bool IsNegative =
false;
781 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
802 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
811 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
816 Value *Recip = emitRcpIEEE1ULP(Builder, Den,
false);
830Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
833 if (ReqdAccuracy < 2.5f)
839 bool NumIsOne =
false;
841 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
849 if (!HasFP32DenormalFlush && !NumIsOne)
852 return Builder.
CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
855Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
857 FastMathFlags SqrtFMF,
Value *RsqOp,
const Instruction *FDivInst,
858 float ReqdDivAccuracy)
const {
861 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
869 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
877 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
881 return emitFrexpDiv(Builder, Num, Den, DivFMF);
899bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
900 if (DisableFDivExpand)
915 FastMathFlags SqrtFMF;
920 Value *RsqOp =
nullptr;
922 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
923 DenII->hasOneUse()) {
925 SqrtFMF = SqrtOp->getFastMathFlags();
926 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
927 RsqOp = SqrtOp->getOperand(0);
931 if (!IsFloat && !RsqOp)
943 const bool AllowInaccurateRcp = DivFMF.
approxFunc();
944 if (!RsqOp && AllowInaccurateRcp)
948 if (IsFloat && ReqdAccuracy < 1.0f)
955 SmallVector<Value *, 4> NumVals;
956 SmallVector<Value *, 4> DenVals;
957 SmallVector<Value *, 4> RsqDenVals;
964 SmallVector<Value *, 4> ResultVals(NumVals.
size());
965 for (
int I = 0,
E = NumVals.
size();
I !=
E; ++
I) {
966 Value *NumElt = NumVals[
I];
967 Value *DenElt = DenVals[
I];
968 Value *RsqDenElt = RsqOp ? RsqDenVals[
I] :
nullptr;
971 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
980 NewEltInst->copyMetadata(FDiv);
983 ResultVals[
I] = NewElt;
991 DeadVals.push_back(&FDiv);
1002 Value *LHS_EXT64 = Builder.CreateZExt(
LHS, I64Ty);
1003 Value *RHS_EXT64 = Builder.CreateZExt(
RHS, I64Ty);
1004 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1005 Value *
Lo = Builder.CreateTrunc(MUL64, I32Ty);
1006 Value *
Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1007 Hi = Builder.CreateTrunc(
Hi, I32Ty);
1008 return std::pair(
Lo,
Hi);
1019unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &
I,
Value *Num,
1021 unsigned MaxDivBits,
1022 bool IsSigned)
const {
1029 unsigned DivBits = SSBits - RHSSignBits + 1;
1030 if (DivBits > MaxDivBits)
1035 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1036 DivBits = SSBits - SignBits + 1;
1046 unsigned DivBits = SSBits - RHSSignBits;
1047 if (DivBits > MaxDivBits)
1055 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1056 DivBits = SSBits - SignBits;
1063 BinaryOperator &
I,
Value *Num,
1064 Value *Den,
bool IsDiv,
1065 bool IsSigned)
const {
1066 unsigned DivBits = getDivNumBits(
I, Num, Den, 24, IsSigned);
1069 return expandDivRem24Impl(Builder,
I, Num, Den, DivBits, IsDiv, IsSigned);
1072Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1074 unsigned DivBits,
bool IsDiv,
bool IsSigned)
const {
1080 ConstantInt *One = Builder.
getInt32(1);
1120 auto FMAD = !ST.hasMadMacF32Insts()
1124 {FQNeg->
getType()}, {FQNeg, FB, FA}, FQ);
1152 if (DivBits != 0 && DivBits < 32) {
1155 int InRegBits = 32 - DivBits;
1157 Res = Builder.
CreateShl(Res, InRegBits);
1160 ConstantInt *TruncMask
1161 = Builder.
getInt32((UINT64_C(1) << DivBits) - 1);
1162 Res = Builder.
CreateAnd(Res, TruncMask);
1173bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &
I,
1179 if (
C->getType()->getScalarSizeInBits() <= 32)
1195 if (BinOpDen->getOpcode() == Instruction::Shl &&
1213 return Builder.CreateAShr(V, Builder.getInt32(31));
1220 assert(
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1221 Opc == Instruction::SRem ||
Opc == Instruction::SDiv);
1227 if (divHasSpecialOptimization(
I,
X,
Y))
1230 bool IsDiv =
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv;
1231 bool IsSigned =
Opc == Instruction::SRem ||
Opc == Instruction::SDiv;
1233 Type *Ty =
X->getType();
1247 if (
Value *Res = expandDivRem24(Builder,
I,
X,
Y, IsDiv, IsSigned)) {
1253 ConstantInt *One = Builder.
getInt32(1);
1255 Value *Sign =
nullptr;
1260 Sign = IsDiv ? Builder.
CreateXor(SignX, SignY) : SignX;
1341 BinaryOperator &
I,
Value *Num,
1343 if (!ExpandDiv64InIR && divHasSpecialOptimization(
I, Num, Den))
1348 bool IsDiv =
Opc == Instruction::SDiv ||
Opc == Instruction::UDiv;
1349 bool IsSigned =
Opc == Instruction::SDiv ||
Opc == Instruction::SRem;
1351 unsigned NumDivBits = getDivNumBits(
I, Num, Den, 32, IsSigned);
1352 if (NumDivBits > 32)
1355 Value *Narrowed =
nullptr;
1356 if (NumDivBits <= 24) {
1357 Narrowed = expandDivRem24Impl(Builder,
I, Num, Den, NumDivBits,
1359 }
else if (NumDivBits <= 32) {
1360 Narrowed = expandDivRem32(Builder,
I, Num, Den);
1371void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &
I)
const {
1374 if (
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv) {
1379 if (
Opc == Instruction::URem ||
Opc == Instruction::SRem) {
1399bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *
I) {
1400 unsigned Opc =
I->getOpcode();
1401 Type *OldType =
I->getType();
1403 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1408 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1410 "Instruction::Mul.");
1414 MaxBitsNeeded = std::max<unsigned>(
bit_ceil(MaxBitsNeeded), 8);
1415 Type *NewType =
DL.getSmallestLegalIntType(
I->getContext(), MaxBitsNeeded);
1419 if (NewBit >= OrigBit)
1431 int NumOfNonConstOps = 2;
1434 NumOfNonConstOps = 1;
1444 if (NewCost >= OldCost)
1455 DeadVals.push_back(
I);
1459bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &
I) {
1460 if (foldBinOpIntoSelect(
I))
1463 if (UseMul24Intrin && replaceMulWithMul24(
I))
1465 if (tryNarrowMathIfNoOverflow(&
I))
1470 Type *Ty =
I.getType();
1471 Value *NewDiv =
nullptr;
1476 if ((
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1477 Opc == Instruction::SRem ||
Opc == Instruction::SDiv) &&
1479 !DisableIDivExpand) {
1480 Value *Num =
I.getOperand(0);
1481 Value *Den =
I.getOperand(1);
1488 for (
unsigned N = 0,
E = VT->getNumElements();
N !=
E; ++
N) {
1493 if (ScalarSize <= 32) {
1494 NewElt = expandDivRem32(Builder,
I, NumEltN, DenEltN);
1500 NewElt = shrinkDivRem64(Builder,
I, NumEltN, DenEltN);
1514 NewEltI->copyIRFlags(&
I);
1519 if (ScalarSize <= 32)
1520 NewDiv = expandDivRem32(Builder,
I, Num, Den);
1522 NewDiv = shrinkDivRem64(Builder,
I, Num, Den);
1529 I.replaceAllUsesWith(NewDiv);
1530 DeadVals.push_back(&
I);
1535 if (ExpandDiv64InIR) {
1537 for (BinaryOperator *Div : Div64ToExpand) {
1538 expandDivRem64(*Div);
1547bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &
I) {
1553 canWidenScalarExtLoad(
I)) {
1563 if (
auto *
Range =
WidenLoad->getMetadata(LLVMContext::MD_range)) {
1564 ConstantInt *
Lower =
1567 if (
Lower->isNullValue()) {
1568 WidenLoad->setMetadata(LLVMContext::MD_range,
nullptr);
1576 WidenLoad->setMetadata(LLVMContext::MD_range,
1581 int TySize =
DL.getTypeSizeInBits(
I.getType());
1586 DeadVals.push_back(&
I);
1593bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &
I) {
1614 Value *Fract =
nullptr;
1615 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1616 CmpVal == matchFractPat(*IIFalse)) {
1618 Fract = applyFractPat(Builder, CmpVal);
1619 }
else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1620 CmpVal == matchFractPat(*IITrue)) {
1622 Fract = applyFractPat(Builder, CmpVal);
1627 I.replaceAllUsesWith(Fract);
1628 DeadVals.push_back(&
I);
1635 return IA && IB && IA->getParent() == IB->getParent();
1645 const Value *CurVal = V;
1648 BitVector EltsCovered(FVT->getNumElements());
1655 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1658 const auto *VecSrc = IE->getOperand(0);
1667 EltsCovered.
set(Idx->getZExtValue());
1670 if (EltsCovered.
all())
1697 const auto [It, Inserted] = SeenPHIs.
insert(&
I);
1701 for (
const Value *Inc :
I.incoming_values()) {
1706 for (
const User *U :
I.users()) {
1712bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(
const PHINode &
I) {
1714 if (
const auto It = BreakPhiNodesCache.find(&
I);
1715 It != BreakPhiNodesCache.end())
1724 SmallPtrSet<const PHINode *, 8> WorkList;
1730 for (
const PHINode *WLP : WorkList) {
1731 assert(BreakPhiNodesCache.count(WLP) == 0);
1746 const auto Threshold = (
alignTo(WorkList.size() * 2, 3) / 3);
1747 unsigned NumBreakablePHIs = 0;
1748 bool CanBreak =
false;
1749 for (
const PHINode *Cur : WorkList) {
1757 if (++NumBreakablePHIs >= Threshold) {
1764 for (
const PHINode *Cur : WorkList)
1765 BreakPhiNodesCache[Cur] = CanBreak;
1814 Value *&Res = SlicedVals[{BB, Inc}];
1820 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1826 Res =
B.CreateShuffleVector(Inc, Mask, NewValName);
1828 Res =
B.CreateExtractElement(Inc,
Idx, NewValName);
1837bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &
I) {
1853 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1856 if (!ForceBreakLargePHIs && !canBreakPHINode(
I))
1859 std::vector<VectorSlice> Slices;
1866 const unsigned EltSize =
DL.getTypeSizeInBits(EltTy);
1868 if (EltSize == 8 || EltSize == 16) {
1869 const unsigned SubVecSize = (32 / EltSize);
1871 for (
unsigned End =
alignDown(NumElts, SubVecSize); Idx < End;
1873 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1877 for (; Idx < NumElts; ++Idx)
1878 Slices.emplace_back(EltTy, Idx, 1);
1881 assert(Slices.size() > 1);
1887 B.SetCurrentDebugLocation(
I.getDebugLoc());
1889 unsigned IncNameSuffix = 0;
1890 for (VectorSlice &S : Slices) {
1893 B.SetInsertPoint(
I.getParent()->getFirstNonPHIIt());
1894 S.NewPHI =
B.CreatePHI(S.Ty,
I.getNumIncomingValues());
1896 for (
const auto &[Idx, BB] :
enumerate(
I.blocks())) {
1897 S.NewPHI->addIncoming(S.getSlicedVal(BB,
I.getIncomingValue(Idx),
1898 "largephi.extractslice" +
1899 std::to_string(IncNameSuffix++)),
1906 unsigned NameSuffix = 0;
1907 for (VectorSlice &S : Slices) {
1908 const auto ValName =
"largephi.insertslice" + std::to_string(NameSuffix++);
1910 Vec =
B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1912 Vec =
B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1915 I.replaceAllUsesWith(Vec);
1916 DeadVals.push_back(&
I);
1939 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1958 assert(SrcPtrKB.getBitWidth() ==
DL.getPointerSizeInBits(AS));
1959 assert((NullVal == 0 || NullVal == -1) &&
1960 "don't know how to check for this null value!");
1961 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1964bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &
I) {
1968 if (
I.getType()->isVectorTy())
1973 const unsigned SrcAS =
I.getSrcAddressSpace();
1974 const unsigned DstAS =
I.getDestAddressSpace();
1976 bool CanLower =
false;
1994 auto *Intrin =
B.CreateIntrinsic(
1995 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
1996 I.replaceAllUsesWith(Intrin);
1997 DeadVals.push_back(&
I);
2001bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &
I) {
2004 case Intrinsic::minnum:
2005 case Intrinsic::minimumnum:
2006 case Intrinsic::minimum:
2007 return visitFMinLike(
I);
2008 case Intrinsic::sqrt:
2009 return visitSqrt(
I);
2010 case Intrinsic::log:
2011 case Intrinsic::log10:
2013 case Intrinsic::log2:
2016 case Intrinsic::amdgcn_mbcnt_lo:
2017 return visitMbcntLo(
I);
2018 case Intrinsic::amdgcn_mbcnt_hi:
2019 return visitMbcntHi(
I);
2032Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &
I) {
2040 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2041 IID != Intrinsic::minimumnum)
2044 Type *Ty =
I.getType();
2048 Value *Arg0 =
I.getArgOperand(0);
2049 Value *Arg1 =
I.getArgOperand(1);
2057 One.convert(
C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2073 SmallVector<Value *, 4> FractVals;
2076 SmallVector<Value *, 4> ResultVals(FractVals.
size());
2079 for (
unsigned I = 0,
E = FractVals.
size();
I !=
E; ++
I) {
2087bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &
I) {
2088 Value *FractArg = matchFractPat(
I);
2098 FastMathFlags FMF =
I.getFastMathFlags();
2102 Value *Fract = applyFractPat(Builder, FractArg);
2104 I.replaceAllUsesWith(Fract);
2105 DeadVals.push_back(&
I);
2110bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2126 if (ReqdAccuracy < 1.0f)
2130 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2134 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2138 SmallVector<Value *, 4> SrcVals;
2141 SmallVector<Value *, 4> ResultVals(SrcVals.
size());
2142 for (
int I = 0,
E = SrcVals.
size();
I !=
E; ++
I) {
2144 ResultVals[
I] = Builder.
CreateCall(getSqrtF32(), SrcVals[
I]);
2146 ResultVals[
I] = emitSqrtIEEE2ULP(Builder, SrcVals[
I], SqrtFMF);
2152 DeadVals.push_back(&Sqrt);
2157bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2163 FastMathFlags FMF =
Log.getFastMathFlags();
2170 if (
Log.getFPAccuracy() < 1.80f)
2181 double Log2BaseInverted =
2188 Log.replaceAllUsesWith(
Mul);
2189 DeadVals.push_back(&Log);
2193bool AMDGPUCodeGenPrepare::runOnFunction(Function &
F) {
2194 if (skipFunction(
F))
2197 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2201 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2202 const TargetLibraryInfo *TLI =
2203 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
F);
2204 AssumptionCache *AC =
2205 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
2206 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2207 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() :
nullptr;
2209 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2210 return AMDGPUCodeGenPrepareImpl(
F, TM, TLI, AC, DT, UA).run();
2220 AMDGPUCodeGenPrepareImpl Impl(
F, ATM, TLI, AC, DT, UA);
2224 if (!Impl.FlowChanged)
2230 "AMDGPU IR optimizations",
false,
false)
2239 CallInst *Tid =
B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2240 ST.makeLIDRangeMetadata(Tid);
2245void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &
I)
const {
2247 CallInst *Tid = createWorkitemIdX(
B);
2253void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2254 Instruction &
I,
unsigned WaveSize)
const {
2256 CallInst *Tid = createWorkitemIdX(
B);
2258 Value *AndInst =
B.CreateAnd(Tid, Mask);
2266bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &
I,
2267 unsigned Wave)
const {
2274 if (*MaybeX == Wave) {
2275 replaceWithWorkitemIdX(
I);
2282 replaceWithMaskedWorkitemIdX(
I, Wave);
2290bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &
I)
const {
2306bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &
I)
const {
2319 if (*MaybeX == Wave) {
2330 using namespace PatternMatch;
2338 return tryReplaceWithWorkitemId(
I, Wave);
2341char AMDGPUCodeGenPrepare::ID = 0;
2344 return new AMDGPUCodeGenPrepare();
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
void visit(MachineFunction &MF, MachineBasicBlock &Start, std::function< void(MachineBasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static int64_t getNullPointerValue(unsigned AddrSpace)
Get the integer value of a null pointer in the given address space.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction if the block is well formed or null if the block is not well forme...
BinaryOps getOpcode() const
bool all() const
all - Returns true if all bits are set.
Represents analyses that only rely on functions' control flow.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Utility class for floating point operations which can have information about relaxed accuracy require...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
void setFast(bool B=true)
bool allowReciprocal() const
void setNoNaNs(bool B=true)
bool allowContract() const
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Base class for instruction visitors.
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
LLVM_ABI const fltSemantics & getFltSemantics() const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
ap_match< APFloat > m_APFloat(const APFloat *&Res)
Match a ConstantFP or splatted ConstantVector, binding the specified pointer to the contained APFloat...
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match() expression.
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ... are the values from the original input ranges.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the original instruction.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
auto reverse(ContainerTy &&C)
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has no NaN elements.
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions.
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
bool isNegative() const
Returns true if this value is known to be negative.
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
SimplifyQuery getWithInstruction(const Instruction *I) const