#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

static cl::opt<bool> WidenLoads(
    "amdgpu-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));

static cl::opt<bool> BreakLargePHIs(
    "amdgpu-codegenprepare-break-large-phis",
    cl::desc("Break large PHI nodes for DAGISel"),
    cl::ReallyHidden, cl::init(true));

static cl::opt<bool> ForceBreakLargePHIs(
    "amdgpu-codegenprepare-force-break-large-phis",
    cl::desc("For testing purposes, always break large "
             "PHIs even if it isn't profitable."),
    cl::ReallyHidden, cl::init(false));

static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),
    cl::ReallyHidden, cl::init(32));

static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(true));

static cl::opt<bool> ExpandDiv64InIR(
    "amdgpu-codegenprepare-expand-div64",
    cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));

static cl::opt<bool> DisableIDivExpand(
    "amdgpu-codegenprepare-disable-idiv-expansion",
    cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));

static cl::opt<bool> DisableFDivExpand(
    "amdgpu-codegenprepare-disable-fdiv-expansion",
    cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
    cl::ReallyHidden, cl::init(false));
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
  // ...
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;

  AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
                           const TargetLibraryInfo *TLI, AssumptionCache *AC,
                           const DominatorTree *DT, const UniformityInfo &UA)
      : // ...
        DT(DT), UA(UA), DL(F.getDataLayout()),
        // ...

  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;
    // ...
    SqrtF32 = Intrinsic::getOrInsertDeclaration(
        F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;
    // ...
    LdexpF32 = Intrinsic::getOrInsertDeclaration(
        F.getParent(), Intrinsic::ldexp,
        {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }
  bool canBreakPHINode(const PHINode &I);

  bool isLegalFloatingTy(const Type *T) const;

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           // ...
  }

  unsigned numBitsUnsigned(Value *Op) const;
  unsigned numBitsSigned(Value *Op) const;

  unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                         unsigned MaxDivBits, bool Signed) const;

  Value *expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num,
                        Value *Den, bool IsDiv, bool IsSigned) const;
  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned DivBits,
                            bool IsDiv, bool IsSigned) const;

  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;
  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDivInst,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
                    FastMathFlags DivFMF, const Instruction *CtxI,
                    bool IsNegative) const;

  void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
  bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
  // ...
};

class AMDGPUCodeGenPrepare : public FunctionPass {
  // ...
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // ...
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};
bool AMDGPUCodeGenPrepareImpl::run() {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;
  // ...
  while (!DeadVals.empty()) {
    // ...
  }
  // ...
}

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  // ...
}
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
  // ...
}

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
  // ...
}
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  // ...
  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder, Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (!Ty->isVectorTy()) {
    // ...
  }
  // ...
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
  // ...
}
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    // ...

  Type *Ty = I.getType();
  // ...
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
    // ...
  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {
    // ...
  }

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  // ...
  IntegerType *I32Ty = Builder.getInt32Ty();
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    // ...
    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);
    // ...
  }

  // ...
  I.replaceAllUsesWith(NewVal);
  DeadVals.push_back(&I);
  // ...
}
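// The mul24 rewrite targets the 24-bit multiply units: when both operands are
// known to need at most 24 bits (unsigned or signed), the multiply is done
// elementwise on i32 operands through amdgcn.mul.u24 / amdgcn.mul.i24 and the
// result is extended or truncated back to the original type. Roughly:
//   %m = mul i32 %a, %b        ; %a, %b proven to fit in 24 bits
// becomes
//   %m = call i32 @llvm.amdgcn.mul.u24(i32 %a, i32 %b)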
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // ...
  if (!CBO || !CT || !CF)
    // ...

  // ...
  Builder.setFastMathFlags(FPOp->getFastMathFlags());
  // ...
  DeadVals.push_back(&BO);
  // ...
  DeadVals.push_back(CastOp);
  DeadVals.push_back(Sel);
  // ...
}
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
                                          Value *Src) const {
  Type *Ty = Src->getType();
  // ...
          : Builder.CreateExtractValue(Frexp, {1});
  return {FrexpMant, FrexpExp};
}
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {
  // ...
  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
  // ...
  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
}
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // ...
  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
  // ...
  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  // ...
}
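// emitRcpIEEE1ULP and emitFrexpDiv implement the denormal-safe reciprocal and
// division patterns: frexp splits each operand into a mantissa in [0.5, 1.0)
// and an exponent, the rcp-based math runs on the well-scaled mantissas, and
// ldexp reapplies the combined exponent, so subnormal inputs and extreme
// quotients survive even though the hardware reciprocal flushes denormals.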
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {
  Type *Ty = Src->getType();
  // ...
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  Value *InputScaleFactor =
      // ...
  Value *OutputScaleFactor =
      // ...
  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
}
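// For f32 sqrt with denormal inputs enabled, the expansion keeps amdgcn.sqrt
// in its accurate range by scaling: inputs below the smallest normal (the
// FCmpOLT above) are scaled up by 2^32 before the sqrt and the result is
// scaled back down by 2^-16, since sqrt halves the exponent.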
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative) {
  Type *Ty = Src->getType();
  // ...
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  Constant *One = ConstantFP::get(Ty, 1.0);
  Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
  Constant *OutputScale =
      ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);

  Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
  Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
  Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
  Value *OutputScaleFactor = Builder.CreateSelect(
      NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
}
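// The scaling above preserves 1/sqrt(x) for subnormal x:
// rsq(x * 2^24) = 2^-12 / sqrt(x), so multiplying the result by +/-2^12 (the
// sign also folds in a requested negation) recovers +/-1/sqrt(x) without ever
// feeding a denormal to the rsq instruction.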
Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
                                            FastMathFlags SqrtFMF,
                                            FastMathFlags DivFMF,
                                            const Instruction *CtxI,
                                            bool IsNegative) const {
  // ...
  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
  bool MaybeZero = !DivFMF.noInfs();
  // ...
  DenormalMode DenormMode;
  // ...
  if (Interested != fcNone) {
    // ...
    DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
    // ...
  }

  if (MaybeZero || MaybePosInf) {
    // ...
    if (MaybePosInf && MaybeZero) {
      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
        // ...
      }
      // ...
    } else if (MaybeZero) {
      // ...
    }
  }

  // ...
  Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
  // ...
      ConstantFP::get(X->getType(), 0.5));
  // ...
  return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
}
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {
  // ...
}

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // ...
  bool IsNegative = false;
  // ...
  IRBuilder<>::FastMathFlagGuard Guard(Builder);
  // ...
      canIgnoreDenormalInput(Den, CtxI)) {
    // ...
  }
  // ...
  return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder,
                                                 Value *Num, Value *Den,
                                                 FastMathFlags FMF,
                                                 const Instruction *CtxI) const {
  // ...
  bool IsNegative = false;
  // ...
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      // ...
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  // ...
  if (HasFP32DenormalFlush || FMF.approxFunc()) {
    // ...
    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    // ...
  }
  // ...
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
    IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
  if (ReqdAccuracy < 2.5f)
    // ...

  bool NumIsOne = false;
  // ...
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      // ...

  if (!HasFP32DenormalFlush && !NumIsOne)
    // ...

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
}
Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
    FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
    float ReqdDivAccuracy) const {
  // ...
      optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
  // ...
  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
  // ...
  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
  // ...
  return emitFrexpDiv(Builder, Num, Den, DivFMF);
}
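// visitFDivElement tries the expansions in decreasing order of preference: an
// rsq-based form when the denominator is a one-use sqrt, a plain rcp when
// fast-math or a literal +/-1.0 numerator allows it, amdgcn.fdiv.fast when
// 2.5 ulp of error is acceptable, and finally the frexp/ldexp-scaled division
// as the general fallback.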
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    // ...

  FastMathFlags SqrtFMF;
  // ...
  Value *RsqOp = nullptr;
  // ...
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    // ...
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // ...
  if (!IsFloat && !RsqOp)
    // ...

  const bool AllowInaccurateRcp = DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    // ...

  if (IsFloat && ReqdAccuracy < 1.0f)
    // ...

  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  // ...
  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
    // ...
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
    // ...
      NewEltInst->copyMetadata(FDiv);
    // ...
    ResultVals[I] = NewElt;
  }

  // ...
  DeadVals.push_back(&FDiv);
  // ...
}
static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder, Value *LHS,
                                            Value *RHS) {
  // ...
  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::pair(Lo, Hi);
}
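// getMul64 forms the full 64-bit product of two i32 values and returns its low
// and high halves; getMulHu (declared with the other file-local helpers at the
// end of this listing) keeps only the high half. The 32-bit division expansion
// uses these to build its reciprocal-based quotient estimate entirely in
// 32-bit integer arithmetic.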
unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                                 Value *Den,
                                                 unsigned MaxDivBits,
                                                 bool IsSigned) const {
  // ...
    unsigned DivBits = SSBits - RHSSignBits + 1;
    if (DivBits > MaxDivBits)
      // ...
    unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    DivBits = SSBits - SignBits + 1;
  // ...
    unsigned DivBits = SSBits - RHSSignBits;
    if (DivBits > MaxDivBits)
      // ...
    unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    DivBits = SSBits - SignBits;
  // ...
}
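// getDivNumBits bounds how many bits of quotient or remainder are actually
// needed: signed operands are measured by their redundant sign bits (first
// block above), unsigned ones by known leading zeros (second block), and the
// query gives up once the bound exceeds MaxDivBits. A udiv i32 whose operands
// are both known to fit in 16 bits, for example, reports at most 16 bits and
// can use the 24-bit float-based expansion below.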
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, bool IsDiv,
                                                bool IsSigned) const {
  unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
  // ...
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  // ...
  ConstantInt *One = Builder.getInt32(1);
  // ...
      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
  // ...
  if (DivBits != 0 && DivBits < 32) {
    // ...
      int InRegBits = 32 - DivBits;
      // ...
      Res = Builder.CreateShl(Res, InRegBits);
      // ...
      ConstantInt *TruncMask =
          Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    // ...
  }
  // ...
}
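// The 24-bit expansion evaluates the division in f32: the operands are
// converted with uitofp/sitofp, a quotient estimate fa * rcp(fb) is truncated
// to an integer, and an fmad-based remainder check adjusts it by one where
// needed. Since only DivBits of the result are meaningful, the tail above
// either sign-extends (shl then ashr by 32 - DivBits) or masks the value back
// to the original width.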
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
                                                         Value *Num,
                                                         Value *Den) const {
  // ...
    if (C->getType()->getScalarSizeInBits() <= 32)
      // ...
    if (BinOpDen->getOpcode() == Instruction::Shl &&
      // ...
  // ...
}

static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
  return Builder.CreateAShr(V, Builder.getInt32(31));
}
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  // ...
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);
  // ...
  if (divHasSpecialOptimization(I, X, Y))
    // ...

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  // ...
  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    // ...
  }

  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  // ...
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
  // ...
}
Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    // ...

  // ...
  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits > 32)
    // ...

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
                                  // ...
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
  }
  // ...
}
void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  // ...
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    // ...
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    // ...
  }
  // ...
}
bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
  unsigned Opc = I->getOpcode();
  Type *OldType = I->getType();

  if (Opc != Instruction::Add && Opc != Instruction::Mul)
    // ...

  // ...
  if (Opc != Instruction::Add && Opc != Instruction::Mul)
    // ...
        "Instruction::Mul.");

  // ...
  MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
  Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
  // ...
  if (NewBit >= OrigBit)
    // ...

  // ...
  int NumOfNonConstOps = 2;
  // ...
    NumOfNonConstOps = 1;
  // ...
  if (NewCost >= OldCost)
    // ...

  // ...
  DeadVals.push_back(I);
  // ...
}
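// tryNarrowMathIfNoOverflow shrinks a wide add or mul whose operands are known
// to need fewer bits: it rounds the required width up to a power of two (at
// least 8), asks the DataLayout for the smallest legal integer type of that
// width, and only rewrites when the cost model says the narrow operation plus
// the extends back to the original type is cheaper than the original
// instruction.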
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    // ...
  if (UseMul24Intrin && replaceMulWithMul24(I))
    // ...
  if (tryNarrowMathIfNoOverflow(&I))
    // ...

  // ...
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  // ...
  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      // ...
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    // ...
      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        // ...
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          // ...
        } else {
          // ...
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          // ...
        }
        // ...
          NewEltI->copyIRFlags(&I);
        // ...
      }
    // ...
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
    // ...
    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      DeadVals.push_back(&I);
      // ...
    }
  }

  if (ExpandDiv64InIR) {
    // ...
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      // ...
    }
  }
  // ...
}
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  // ...
      canWidenScalarExtLoad(I)) {
    // ...
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          // ...
      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        // ...
        WidenLoad->setMetadata(LLVMContext::MD_range,
            // ...
      }
    }

    int TySize = DL.getTypeSizeInBits(I.getType());
    // ...
    DeadVals.push_back(&I);
    // ...
  }
  // ...
}
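// Widening turns a uniform sub-dword load from the constant address space into
// a dword load plus a truncate, since scalar memory loads are dword-granular
// anyway. Roughly:
//   %v = load i8, ptr addrspace(4) %p, align 4
// becomes
//   %w = load i32, ptr addrspace(4) %p, align 4
//   %v = trunc i32 %w to i8
// with any !range metadata dropped or rewritten for the widened type.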
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  // ...
  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {
    // ...
    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {
    // ...
    Fract = applyFractPat(Builder, CmpVal);
  }
  // ...
  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  // ...
}
static bool areInSameBB(const Value *A, const Value *B) {
  // ...
  return IA && IB && IA->getParent() == IB->getParent();
}

static bool isInterestingPHIIncomingValue(const Value *V) {
  const Value *CurVal = V;
  // ...
    BitVector EltsCovered(FVT->getNumElements());
    // ...
      if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
        // ...
      const auto *VecSrc = IE->getOperand(0);
      // ...
      EltsCovered.set(Idx->getZExtValue());

      if (EltsCovered.all())
        // ...
  // ...
}
static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
  const auto [It, Inserted] = SeenPHIs.insert(&I);
  // ...
  for (const Value *Inc : I.incoming_values()) {
    // ...
  }

  for (const User *U : I.users()) {
    // ...
  }
}
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    // ...

  // ...
  SmallPtrSet<const PHINode *, 8> WorkList;
  // ...
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
    // ...
  }

  // ...
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // ...
    if (++NumBreakablePHIs >= Threshold) {
      // ...
    }
  }

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;
  // ...
}
Value *VectorSlice::getSlicedVal(BasicBlock *BB, Value *Inc,
                                 StringRef NewValName) {
  Value *&Res = SlicedVals[{BB, Inc}];
  // ...
    B.SetCurrentDebugLocation(IncInst->getDebugLoc());
  // ...
    Res = B.CreateShuffleVector(Inc, Mask, NewValName);
  // ...
    Res = B.CreateExtractElement(Inc, Idx, NewValName);
  // ...
}
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // ...
      DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    // ...

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    // ...

  std::vector<VectorSlice> Slices;
  // ...
  const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
  // ...
  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = (32 / EltSize);
    // ...
    for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
         // ...
      Slices.emplace_back(SubVecTy, Idx, SubVecSize);
  }

  // ...
  for (; Idx < NumElts; ++Idx)
    Slices.emplace_back(EltTy, Idx, 1);

  assert(Slices.size() > 1);

  // ...
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // ...
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

    for (const auto &[Idx, BB] : enumerate(I.blocks())) {
      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),
                            // ...
    }
  }

  // ...
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    // ...
      Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
    // ...
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  DeadVals.push_back(&I);
  // ...
}
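// Breaking a large vector PHI replaces one PHI over, say, <8 x i32> with one
// PHI per 32-bit slice: each predecessor contributes an extractelement or
// shufflevector of its incoming value ("largephi.extractslice*"), and the
// slices are reassembled right after the PHIs ("largephi.insertslice*"). This
// keeps SelectionDAG, which works one basic block at a time, from shuffling
// the whole vector across the CFG through long chains of copies.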
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS) {
  // ...
      Load && Load->hasMetadata(LLVMContext::MD_nonnull))
    // ...

  // ...
  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
}
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  // ...
  if (I.getType()->isVectorTy())
    // ...

  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;
  // ...
  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  DeadVals.push_back(&I);
  // ...
}
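// An addrspacecast to or from flat must treat the null pointer specially,
// because null maps to a different bit pattern across address spaces. When
// isPtrKnownNeverNull proves the source non-null (nonnull metadata, known
// bits, or the underlying object), the cast is replaced with
// amdgcn.addrspacecast.nonnull so the backend can emit the plain aperture
// arithmetic without the null check.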
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::minnum:
  case Intrinsic::minimumnum:
  case Intrinsic::minimum:
    return visitFMinLike(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);
  case Intrinsic::amdgcn_mbcnt_lo:
    return visitMbcntLo(I);
  case Intrinsic::amdgcn_mbcnt_hi:
    return visitMbcntHi(I);
  // ...
  }
}
Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
  // ...
  if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
      IID != Intrinsic::minimumnum)
    // ...

  Type *Ty = I.getType();
  // ...
  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);
  // ...
  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
  // ...
}
Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
                                               Value *FractArg) {
  SmallVector<Value *, 4> FractVals;
  // ...
  SmallVector<Value *, 4> ResultVals(FractVals.size());
  // ...
  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
    // ...
  }
  // ...
}
bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
  Value *FractArg = matchFractPat(I);
  // ...

  FastMathFlags FMF = I.getFastMathFlags();
  // ...
  Value *Fract = applyFractPat(Builder, FractArg);
  // ...
  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  // ...
}
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  // ...
  if (ReqdAccuracy < 1.0f)
    // ...

  // ...
  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
  // ...
  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    // ...

  SmallVector<Value *, 4> SrcVals;
  // ...
  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    // ...
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
    // ...
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
  }

  // ...
  DeadVals.push_back(&Sqrt);
  // ...
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    // ...

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  // ...
  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  const TargetLibraryInfo *TLI =
      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  AssumptionCache *AC =
      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
  const UniformityInfo &UA =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
}
PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
  // ...
  AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
  // ...
  if (!Impl.FlowChanged)
    // ...
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
// ...
// In createWorkitemIdX:
  CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
  ST.makeLIDRangeMetadata(Tid);
  // ...

void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
  // ...
  CallInst *Tid = createWorkitemIdX(B);
  // ...
}

void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
    Instruction &I, unsigned WaveSize) const {
  // ...
  CallInst *Tid = createWorkitemIdX(B);
  // ...
  Value *AndInst = B.CreateAnd(Tid, Mask);
  // ...
}

bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
                                                        unsigned Wave) const {
  // ...
  if (*MaybeX == Wave) {
    replaceWithWorkitemIdX(I);
    // ...
  }
  // ...
  replaceWithMaskedWorkitemIdX(I, Wave);
  // ...
}
bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
  // ...
}

bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
  // ...
  if (*MaybeX == Wave) {
    // ...
  }
  // ...
  using namespace PatternMatch;
  // ...
  return tryReplaceWithWorkitemId(I, Wave);
}
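// mbcnt_lo(~0, 0) counts the active lanes below the current one, and
// mbcnt_hi(~0, mbcnt_lo(~0, 0)) extends that count to 64 lanes. When the
// required workgroup size makes waves split evenly along X, that lane count
// equals workitem.id.x within the wave, so the call is replaced by the
// workitem id directly (when the X dimension is exactly one wave) or by the
// id masked with the wave size minus one.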
char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
// File-local helpers referenced above:
static Value *insertValues(IRBuilder<> &Builder, Type *Ty,
                           SmallVectorImpl<Value *> &Values);
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V);
static Value *getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS);
static bool isInterestingPHIIncomingValue(const Value *V);
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast);
static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder, Value *LHS,
                                            Value *RHS);
// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                              bool IsNegative);
static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL);
static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs);
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
                                const AMDGPUTargetMachine &TM, unsigned AS);
static bool areInSameBB(const Value *A, const Value *B);