29#include "llvm/IR/IntrinsicsAMDGPU.h"
40#define DEBUG_TYPE "amdgpu-codegenprepare"
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
54 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
55 cl::desc(
"Break large PHI nodes for DAGISel"),
59 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc(
"For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
70 "amdgpu-codegenprepare-mul24",
71 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;

  AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
                           const TargetLibraryInfo *TLI, AssumptionCache *AC,
                           const DominatorTree *DT, const UniformityInfo &UA)
      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
        DT(DT), UA(UA), DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
        HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
                             DenormalMode::getPreserveSign()) {}

  Function *getSqrtF32() const {
    if (!SqrtF32) {
      LLVMContext &Ctx = F.getContext();
      SqrtF32 = Intrinsic::getOrInsertDeclaration(
          F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
    }
    return SqrtF32;
  }

  Function *getLdexpF32() const {
    if (!LdexpF32) {
      LLVMContext &Ctx = F.getContext();
      LdexpF32 = Intrinsic::getOrInsertDeclaration(
          F.getParent(), Intrinsic::ldexp,
          {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    }
    return LdexpF32;
  }
  bool canBreakPHINode(const PHINode &I);

  bool isLegalFloatingTy(const Type *T) const;

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
           computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
  }

  unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                         unsigned MaxDivBits, bool Signed) const;
  Value *expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I, Value *Num,
                        Value *Den, bool IsDiv, bool IsSigned) const;
  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned DivBits,
                            bool IsDiv, bool IsSigned) const;

  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;
  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDivInst,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;
  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
                    FastMathFlags DivFMF, const Instruction *CtxI,
                    bool IsNegative) const;

  void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
  bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }

  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};
bool AMDGPUCodeGenPrepareImpl::run() {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

  for (BasicBlock &BB : reverse(F))
    for (Instruction &I : make_early_inc_range(reverse(BB)))
      MadeChange |= visit(I);

  while (!DeadVals.empty())
    RecursivelyDeleteTriviallyDeadInstructions(DeadVals.pop_back_val(), TLI);

  return MadeChange;
}
bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  return Ty->isFloatTy() || Ty->isDoubleTy() ||
         (Ty->isHalfTy() && ST.has16BitInsts());
}
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniformAtDef(&I);
}
unsigned
AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
                                          const Instruction *CtxI) const {
  return computeKnownBits(Op, DL, AC, CtxI).countMaxActiveBits();
}

unsigned
AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
                                        const Instruction *CtxI) const {
  return ComputeMaxSignificantBits(Op, DL, AC, CtxI);
}
static void extractValues(IRBuilder<> &Builder,
                          SmallVectorImpl<Value *> &Values, Value *V) {
  auto *VT = dyn_cast<FixedVectorType>(V->getType());
  if (!VT) {
    Values.push_back(V);
    return;
  }

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)
    Values.push_back(Builder.CreateExtractElement(V, I));
}

static Value *insertValues(IRBuilder<> &Builder, Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  if (!Ty->isVectorTy()) {
    assert(Values.size() == 1);
    return Values[0];
  }

  Value *NewVal = PoisonValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  if (Size <= 16 && ST.has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32.
  if (UA.isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
    IsSigned = false;
  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
             (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
    IsSigned = true;
  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    Intrinsic::ID ID =
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Value *Result = Builder.CreateIntrinsic(IntrinTy, ID, {LHS, RHS});
    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);
    ResultVals.push_back(Result);
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  DeadVals.push_back(&I);
  return true;
}
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  CastInst *CastOp = nullptr;
  int SelOpNo = 0;
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  Constant *CT = Sel ? dyn_cast<Constant>(Sel->getTrueValue()) : nullptr;
  Constant *CF = Sel ? dyn_cast<Constant>(Sel->getFalseValue()) : nullptr;
  Constant *CBO =
      Sel ? dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1)) : nullptr;
  if (!CBO || !CT || !CF)
    return false;

  // Constant-fold the binop into both arms and emit a select of the results.
  Constant *FoldedT =
      SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
              : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
  Constant *FoldedF =
      SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
              : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
  if (!FoldedT || !FoldedF)
    return false;

  IRBuilder<> Builder(&BO);
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect =
      Builder.CreateSelect(Sel->getCondition(), FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  DeadVals.push_back(&BO);
  if (CastOp)
    DeadVals.push_back(CastOp);
  DeadVals.push_back(Sel);
  return true;
}
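// For example, when both select arms and the other operand are constants,
//   %s = select i1 %c, i32 4, i32 8
//   %r = mul i32 %s, 3
// folds to a single select of the pre-folded constants:
//   %r = select i1 %c, i32 12, i32 24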
504std::pair<Value *, Value *>
505AMDGPUCodeGenPrepareImpl::getFrexpResults(
IRBuilder<> &Builder,
507 Type *Ty = Src->getType();
520 : Builder.CreateExtractValue(Frexp, {1});
521 return {FrexpMant, FrexpExp};
Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
                                                 Value *Src,
                                                 bool IsNegative) const {
  // -1.0 / x -> rcp (fneg x)
  if (IsNegative)
    Src = Builder.CreateFNeg(Src);

  // The rcp instruction doesn't support denormals, so scale the input out of
  // the denormal range and convert back at the end.
  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
  Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
  Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
}
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than using
  // the fdiv.fast expansion. The full safe expansion is faster if we have
  // fast FMA.
  if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
      (!FMF.noNaNs() || !FMF.noInfs()))
    return nullptr;

  // Scale both operands out of the denormal range via frexp; the rcp of the
  // scaled RHS mantissa is then safe to use.
  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  Value *Rcp =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);

  // We multiplied by 2^N / 2^M, so multiply by 2^(N-M) to scale the result.
  Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
  return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
}
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {
  Type *Ty = Src->getType();
  APFloat SmallestNormal =
      APFloat::getSmallestNormalized(Ty->getFltSemantics());
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  ConstantInt *Zero = Builder.getInt32(0);
  Value *InputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);

  Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
  Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);

  Value *OutputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
}
606 Type *Ty = Src->getType();
610 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
611 Constant *One = ConstantFP::get(Ty, 1.0);
612 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
614 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
616 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
618 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
619 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
620 Value *OutputScaleFactor = Builder.CreateSelect(
621 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
623 return Builder.CreateFMul(Rsq, OutputScaleFactor);
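// Scaling math for the expansion above: if x is a denormal and is scaled up
// by 2^24, then rsq(x * 2^24) = 1/sqrt(x) * 2^-12, so multiplying the result
// by 2^12 (or -2^12 for the negated form) recovers +/-1/sqrt(x) while keeping
// the rsq input out of the denormal range.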
Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
                                            FastMathFlags SqrtFMF,
                                            FastMathFlags DivFMF,
                                            const Instruction *CtxI,
                                            bool IsNegative) const {
  // Zero and +inf inputs produce inf/zero results and need special handling
  // unless the fast-math flags rule them out.
  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
  bool MaybeZero = !DivFMF.noInfs();

  DenormalMode DenormMode;
  // ...
  if (Interested != fcNone) {
    // ...
    DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
    // ...
  }

  if (MaybeZero || MaybePosInf) {
    // ...
    if (MaybePosInf && MaybeZero) {
      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
        // ...
      }
      // ...
    } else if (MaybeZero) {
      // ...
    }
  }

  // Refine the initial rsq estimate Y0 with a fused multiply-add sequence.
  // ...
  Value *E =
      Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
  // ...
  Value *EFMA = Builder.CreateFMA(E, E, ConstantFP::get(X->getType(), 0.5));
  return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
}
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
                                                  FastMathFlags SqrtFMF) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  return DivFMF.allowContract() && SqrtFMF.allowContract();
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp (same as rcp).
  // rsq_f64 needs a refinement sequence.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
        canIgnoreDenormalInput(Den, CtxI)) {
      Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
      // -1.0 / sqrt(x) -> fneg(rsq(x))
      return IsNegative ? Builder.CreateFNeg(Result) : Result;
    }

    if (Den->getType()->isFloatTy())
      return emitRsqIEEE1ULP(Builder, Den, IsNegative);

    return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
  }

  return nullptr;
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder,
                                                 Value *Num, Value *Den,
                                                 FastMathFlags FMF,
                                                 const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp (same as rsq).
  // rcp_f64 is never accurate.
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 doesn't support denormals; its worst-case 1 ulp error is
        // within the 2.5 ulp OpenCL allows for single-precision division.
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
      return Builder.CreateFMul(Num, Recip);
    }

    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    return Builder.CreateFMul(Num, Recip);
  }

  return nullptr;
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
    IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
  // fdiv.fast can achieve 2.5 ULP accuracy.
  if (ReqdAccuracy < 2.5f)
    return nullptr;

  // Only have fdiv.fast for f32.
  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
      NumIsOne = true;
  }

  // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
  if (!HasFP32DenormalFlush && !NumIsOne)
    return nullptr;

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
}
Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
    FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
    float ReqdDivAccuracy) const {
  if (RsqOp) {
    Value *Rsq =
        optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
    if (Rsq)
      return Rsq;
  }

  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
  if (Rcp)
    return Rcp;

  // In the basic case fdiv_fast has the same instruction count as the frexp
  // div expansion, and is slightly preferable since it ends in an fmul and
  // doesn't have denormal flushing problems.
  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
  if (FDivFast)
    return FDivFast;

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
}
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  Type *Ty = FDiv.getType()->getScalarType();
  const bool IsFloat = Ty->isFloatTy();

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // There are fdiv/rcp expansions only for f32; other types are handled in
  // codegen unless an rsq is being formed.
  if (!IsFloat && !RsqOp)
    return false;

  // With approx, the plain rcp instruction is already acceptable in codegen.
  const bool AllowInaccurateRcp = DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the fully correct expansions to codegen.
  if (IsFloat && ReqdAccuracy < 1.0f)
    return false;

  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);
  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         &FDiv, ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
    }

    if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
      NewEltInst->copyMetadata(FDiv);

    ResultVals[I] = NewElt;
  }

  Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
  NewVal->takeName(&FDiv);
  FDiv.replaceAllUsesWith(NewVal);
  DeadVals.push_back(&FDiv);
  return true;
}
static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder, Value *LHS,
                                            Value *RHS) {
  Type *I32Ty = Builder.getInt32Ty();
  Type *I64Ty = Builder.getInt64Ty();

  Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
  Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
  Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
  Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
  Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
  Hi = Builder.CreateTrunc(Hi, I32Ty);
  return std::pair(Lo, Hi);
}
unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                                 Value *Den,
                                                 unsigned MaxDivBits,
                                                 bool IsSigned) const {
  unsigned SSBits = Num->getType()->getScalarSizeInBits();
  if (IsSigned) {
    unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
    // A sign bit needs to be reserved for shrinking.
    unsigned DivBits = SSBits - RHSSignBits + 1;
    if (DivBits > MaxDivBits)
      return SSBits;

    unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);

    unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    DivBits = SSBits - SignBits + 1;
    return DivBits;
  }

  // For unsigned operands, known leading zeros bound the width instead.
  KnownBits Known = computeKnownBits(Den, DL, AC, &I);
  unsigned RHSSignBits = Known.countMinLeadingZeros();
  unsigned DivBits = SSBits - RHSSignBits;
  if (DivBits > MaxDivBits)
    return SSBits;

  Known = computeKnownBits(Num, DL, AC, &I);
  unsigned LHSSignBits = Known.countMinLeadingZeros();

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  DivBits = SSBits - SignBits;
  return DivBits;
}
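// Worked example (illustrative): for an i32 sdiv whose operands are both
// sign-extended from i16, ComputeNumSignBits returns at least 17 for each
// side, so DivBits = 32 - 17 + 1 = 16. That is <= 24, so the divide can use
// the float-based 24-bit expansion instead of the full 32-bit sequence.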
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den, bool IsDiv,
                                                bool IsSigned) const {
  unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
  if (DivBits > 24)
    return nullptr;
  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  ConstantInt *One = Builder.getInt32(1);

  // ... (float-based expansion: convert the operands to f32, form
  // fq = trunc(fa * rcp(fb)), then correct the quotient with a fused
  // multiply-add) ...

  // float fr = mad(fqneg, fb, fa);
  auto FMAD = !ST.hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // ... (compute the integer quotient/remainder from FQ and FR) ...

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask =
          Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
                                                         Value *Num,
                                                         Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wide mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;
    // ...
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c) + y) iff c is a power of 2
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT))
      return true;
  }

  return false;
}

static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
  // Check whether the sign can be determined statically.
  KnownBits Known = computeKnownBits(V, DL);
  if (Known.isNegative())
    return Constant::getAllOnesValue(V->getType());
  if (Known.isNonNegative())
    return Constant::getNullValue(V->getType());
  return Builder.CreateAShr(V, Builder.getInt32(31));
}
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr; // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  // ...

  // Try the 24-bit expansion first when the operands are narrow enough.
  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty)
                    : Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS.
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
    // ...
  }

  // ... (the remaining unsigned 32-bit division expansion) ...
}
Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *Num,
                                                Value *Den) const {
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
    return nullptr; // Keep it for later optimization.

  Instruction::BinaryOps Opc = I.getOpcode();

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits > 32)
    return nullptr;

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
                                  IsDiv, IsSigned);
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);
  }

  if (Narrowed) {
    return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType())
                    : Builder.CreateZExt(Narrowed, Num->getType());
  }

  return nullptr;
}
void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  // Do the general expansion.
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    expandDivisionUpTo64Bits(&I);
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    expandRemainderUpTo64Bits(&I);
    return;
  }

  llvm_unreachable("not a division");
}
bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
  unsigned Opc = I->getOpcode();
  Type *OldType = I->getType();

  if (Opc != Instruction::Add && Opc != Instruction::Mul)
    return false;

  unsigned OrigBit = OldType->getScalarSizeInBits();
  unsigned MaxBitsNeeded = OrigBit;

  // Bound the number of bits the result can actually need from the known
  // bits of the operands.
  // ...
  if (Opc != Instruction::Add && Opc != Instruction::Mul)
    llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
                     "Instruction::Mul.");

  MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
  Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
  if (!NewType)
    return false;
  unsigned NewBit = NewType->getIntegerBitWidth();
  if (NewBit >= OrigBit)
    return false;

  // Cost model: narrowing requires truncating the non-constant operands and
  // zero-extending the result back to the original width.
  int NumOfNonConstOps = 2;
  if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1)))
    NumOfNonConstOps = 1;
  // ...
  if (NewCost >= OldCost)
    return false;

  IRBuilder<> Builder(I);
  Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
  Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
  Value *Arith =
      Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);

  Value *Zext = Builder.CreateZExt(Arith, OldType);
  I->replaceAllUsesWith(Zext);
  DeadVals.push_back(I);
  return true;
}
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;
  if (tryNarrowMathIfNoOverflow(&I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  SmallVector<BinaryOperator *, 8> Div64ToExpand;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 && !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      NewDiv = PoisonValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24 bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and
            // doesn't return a new value. Just insert a scalar copy and
            // defer expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
          }
        }

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      DeadVals.push_back(&I);
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
  if (!WidenLoads)
    return false;

  if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
       I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
      canWidenScalarExtLoad(I)) {
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    Type *I32Ty = Builder.getInt32Ty();
    LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
    WidenLoad->copyMetadata(I);

    // If we have range metadata, we need to convert the type, and not make
    // assumptions about the high bits.
    if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
      ConstantInt *Lower =
          mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
      } else {
        Metadata *LowAndHigh[] = {
            ConstantAsMetadata::get(
                ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
            // Don't make assumptions about the high bits.
            ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))};

        WidenLoad->setMetadata(LLVMContext::MD_range,
                               MDNode::get(F.getContext(), LowAndHigh));
      }
    }

    int TySize = DL.getTypeSizeInBits(I.getType());
    Type *IntNTy = Builder.getIntNTy(TySize);
    Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
    Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
    I.replaceAllUsesWith(ValOrig);
    DeadVals.push_back(&I);
    return true;
  }

  return false;
}
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  // Match the fract idiom: a select guarding fsub(x, floor(x)) against NaN
  // (and infinity) inputs, in its various forms.
  Value *Fract = nullptr;
  // ...
  Value *FractSrc = matchFractPatImpl(*X, *C);
  if (FractSrc) {
    // ...
    Fract = applyFractPat(Builder, FractSrc);
  }
  // ...
  CmpPredicate IsNanPred;
  // ...
  if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
      CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
    // select (fcmp uno x, x), x, fract-pattern(x) --> fract(x)
    Fract = applyFractPat(Builder, CmpVal);
  } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
    if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
      // select (fcmp ord x, x), fract-pattern(x), x --> fract(x)
      Fract = applyFractPat(Builder, CmpVal);
    }
  } else {
    // The clamp-against-infinity form.
    CmpPredicate PredInf;
    // ...
    if (PredInf != FCmpInst::FCMP_UNE ||
        CmpVal != matchFractPatNanAvoidant(*IfNotInf))
      return false;
    // ...
    Value *NewFract = applyFractPat(Builder, CmpVal);
    // ...
    DeadVals.push_back(ClampInfSelect->getOperand(1));
    // ...
    Fract = ClampInfSelect;
  }

  if (!Fract)
    return false;

  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  return true;
}
1686 return IA && IB && IA->getParent() == IB->getParent();
1696 const Value *CurVal = V;
1699 BitVector EltsCovered(FVT->getNumElements());
1706 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1709 const auto *VecSrc = IE->getOperand(0);
1718 EltsCovered.
set(Idx->getZExtValue());
1721 if (EltsCovered.
all())
static void collectPHINodes(const PHINode &I,
                            SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
  const auto [It, Inserted] = SeenPHIs.insert(&I);
  if (!Inserted)
    return;

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
      collectPHINodes(*PhiInc, SeenPHIs);
  }

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
      collectPHINodes(*PhiU, SeenPHIs);
  }
}
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check the cache first.
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // PHI nodes are considered in "chains": given a PHI, collect every PHI
  // reachable through its users and incoming values, and decide about all of
  // them at once — either they all get broken up, or none of them do.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, WorkList);

#ifndef NDEBUG
  // None of the PHI nodes in the worklist should already be in the cache.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider the chain profitable to break, at least 2/3rds (rounded up)
  // of the PHIs in the worklist must have an interesting incoming value. The
  // computation below is an integer-only way of writing
  // (unsigned)ceil((K / 3.0) * 2).
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Only PHIs with an interesting incoming value give the DAG combiner a
    // clear opportunity to fold the extractelements we would add.
    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}
1865 Value *&Res = SlicedVals[{BB, Inc}];
1871 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1877 Res =
B.CreateShuffleVector(Inc, Mask, NewValName);
1879 Res =
B.CreateExtractElement(Inc,
Idx, NewValName);
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break up fixed-vector PHIs into smaller pieces. Large PHIs inhibit
  // DAGISel, which lowers them through CopyToReg/CopyFromReg and can end up
  // with excessive register pressure and spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    return false;

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    const unsigned NumElts = FVT->getNumElements();
    const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
    // For 8/16-bit elements, don't scalarize fully; break the vector into as
    // many 32-bit subvector slices as possible and scalarize the tail.
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(SubVecTy, Idx, SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);
  }

  assert(Slices.size() > 1);

  // Create one PHI per slice; getSlicedVal emits the instructions that
  // extract the relevant slice of each incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // Reset the builder on each iteration, because getSlicedVal may have
    // inserted something into I's block.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

    for (const auto &[Idx, BB] : enumerate(I.blocks())) {
      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),
                            BB);
    }
  }

  // Replace this PHI with a vector rebuilt from all the slice PHIs.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    if (S.NumElts > 1)
      Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  I.replaceAllUsesWith(Vec);
  DeadVals.push_back(&I);
  return true;
}
1990 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
2009 assert(SrcPtrKB.getBitWidth() ==
DL.getPointerSizeInBits(AS));
2010 assert((NullVal == 0 || NullVal == -1) &&
2011 "don't know how to check for this null value!");
2012 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
  // The intrinsic doesn't support vectors, and proving that no element of a
  // vector is null is rarely possible anyway.
  if (I.getType()->isVectorTy())
    return false;

  // Check if this can be lowered to amdgcn.addrspacecast.nonnull; that is
  // only worthwhile for casts between flat and local/private.
  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;
  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
                DstAS == AMDGPUAS::PRIVATE_ADDRESS);
  else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
    CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
                SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
  if (!CanLower)
    return false;

  SmallVector<const Value *, 4> WorkList;
  getUnderlyingObjects(I.getOperand(0), WorkList);
  if (!all_of(WorkList, [&](const Value *V) {
        return isPtrKnownNeverNull(V, DL, TM, SrcAS);
      }))
    return false;

  IRBuilder<> B(&I);
  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  DeadVals.push_back(&I);
  return true;
}
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::minnum:
  case Intrinsic::minimumnum:
  case Intrinsic::minimum:
    return visitFMinLike(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);
  case Intrinsic::log:
  case Intrinsic::log10:
    return visitLog(*cast<FPMathOperator>(&I), I.getIntrinsicID());
  case Intrinsic::log2:
    // log2 maps directly onto the hardware instruction.
    return false;
  case Intrinsic::amdgcn_mbcnt_lo:
    return visitMbcntLo(I);
  case Intrinsic::amdgcn_mbcnt_hi:
    return visitMbcntHi(I);
  case Intrinsic::vector_reduce_add:
    return visitVectorReduceAdd(I);
  case Intrinsic::uadd_sat:
  case Intrinsic::sadd_sat:
    return visitSaturatingAdd(I);
  default:
    return false;
  }
}
Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(Value &FractSrc,
                                                   const APFloat &C) const {
  // The fract pattern is only valid if the clamp constant is exactly the
  // largest value below 1.0, i.e. nextafter(1.0, 0.0) in the type's
  // semantics.
  // ...
  APFloat OneNextDown = APFloat::getOne(C.getSemantics());
  OneNextDown.next(true);
  if (OneNextDown != C)
    return nullptr;
  // ...
}

Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(Value &V) {
  // Match the NaN-guarded variant of the pattern: an fmin of the fract
  // expansion with the clamp constant.
  // ...
  return matchFractPatImpl(*Arg0, *C);
}
2133 SmallVector<Value *, 4> FractVals;
2136 SmallVector<Value *, 4> ResultVals(FractVals.
size());
2139 for (
unsigned I = 0,
E = FractVals.
size();
I !=
E; ++
I) {
bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
  Value *FractArg = nullptr;
  // fmin(x - floor(x), nextafter(1.0, 0.0)) is the fract idiom; try the
  // direct match first, then the NaN-avoidant form.
  // ...
  FractArg = matchFractPatImpl(*X, *C);
  // ...
  if (!FractArg)
    FractArg = matchFractPatNanAvoidant(I);
  if (!FractArg)
    return false;

  // ...
  IRBuilder<> Builder(&I);
  FastMathFlags FMF = I.getFastMathFlags();
  FMF.setNoNaNs();
  Builder.setFastMathFlags(FMF);

  Value *Fract = applyFractPat(Builder, FractArg);
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  return true;
}
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  Type *Ty = Sqrt.getType()->getScalarType();
  if (!Ty->isFloatTy())
    return false;

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
  FastMathFlags SqrtFMF = FPOp->getFastMathFlags();

  // Trust the backend to handle approx sqrt.
  if (SqrtFMF.approxFunc())
    return false;

  Value *SrcVal = Sqrt.getOperand(0);

  // The raw instruction is 1 ulp, but the correction for denormal handling
  // brings it to 2.
  const float ReqdAccuracy = FPOp->getFPAccuracy();
  if (ReqdAccuracy < 1.0f)
    return false;

  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
    return false;

  IRBuilder<> Builder(&Sqrt);
  SmallVector<Value *, 4> SrcVals;
  extractValues(Builder, SrcVals, SrcVal);

  SmallVector<Value *, 4> ResultVals(SrcVals.size());
  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
    if (CanTreatAsDAZ)
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
    else
      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
  }

  Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
  NewSqrt->takeName(&Sqrt);
  Sqrt.replaceAllUsesWith(NewSqrt);
  DeadVals.push_back(&Sqrt);
  return true;
}
bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
                                        Intrinsic::ID IID) {
  // ...
  FastMathFlags FMF = Log.getFastMathFlags();
  // ...
  // v_log_f32 is good to 1 ulp, and the multiply by the inverted log2 of the
  // base costs less than another ulp, so the rewrite needs ~1.80 ulp slack.
  if (Log.getFPAccuracy() < 1.80f)
    return false;

  // log_b(x) = log2(x) * (1 / log2(b))
  double Log2BaseInverted =
      IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
  // ... (emit the log2 call and the scaling fmul) ...
  Log.replaceAllUsesWith(Mul);
  DeadVals.push_back(&Log);
  return true;
}
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))
    return false;

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
  if (!TPC)
    return false;

  const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
  const TargetLibraryInfo *TLI =
      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  AssumptionCache *AC =
      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
  const UniformityInfo &UA =
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
}
PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
                                                FunctionAnalysisManager &FAM) {
  const auto &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
  const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
  AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
  const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
  const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
  AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
  if (!Impl.run())
    return PreservedAnalyses::all();
  PreservedAnalyses PA = PreservedAnalyses::none();
  if (!Impl.FlowChanged)
    PA.preserveSet<CFGAnalyses>();
  return PA;
}

INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                      "AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE,
                    "AMDGPU IR optimizations", false, false)
CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
  CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
  ST.makeLIDRangeMetadata(Tid);
  return Tid;
}

void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
  IRBuilder<> B(&I);
  CallInst *Tid = createWorkitemIdX(B);
  I.replaceAllUsesWith(Tid);
  // ...
}
void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
    Instruction &I, unsigned WaveSize) const {
  IRBuilder<> B(&I);
  CallInst *Tid = createWorkitemIdX(B);
  Value *Mask = B.getInt32(WaveSize - 1);
  Value *AndInst = B.CreateAnd(Tid, Mask);
  I.replaceAllUsesWith(AndInst);
  // ...
}
bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
                                                        unsigned Wave) const {
  // The lane-id idiom equals workitem.id.x when the workgroup's X dimension
  // is exactly one wave, and workitem.id.x masked to the wave size when the
  // X dimension splits evenly into waves.
  // ...
  if (*MaybeX == Wave) {
    replaceWithWorkitemIdX(I);
    return true;
  }
  // ...
  replaceWithMaskedWorkitemIdX(I, Wave);
  return true;
}
bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
  // On wave32, mbcnt.lo(~0, 0) counts the active lanes below the current
  // one, which is the lane id when the wave is fully populated.
  // ...
}

bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
  // ...
  if (*MaybeX == Wave) {
    // ...
  }
  // ...
  using namespace PatternMatch;
  // Match mbcnt.hi(~0, mbcnt.lo(~0, 0)), the canonical wave64 lane-id idiom.
  // ...
  return tryReplaceWithWorkitemId(I, Wave);
}

// Helper to match the dot4 pattern:
//   mul(zext/sext <4 x i8>, zext/sext <4 x i8>)
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B,
                             bool IsSigned) {
  Value *ExtSrc0, *ExtSrc1;
  // ...
}
bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &I) {
  // The rewrite needs the dot4 instructions.
  if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
    return false;

  Value *A = nullptr, *B = nullptr;
  bool IsSigned = false;
  // ... (match the mul(ext <4 x i8>, ext <4 x i8>) operand and set IsSigned)

  LLVMContext &Ctx = I.getContext();
  Type *I32Ty = Type::getInt32Ty(Ctx);
  IRBuilder<> Builder(&I);
  Value *CastA = Builder.CreateBitCast(A, I32Ty);
  Value *CastB = Builder.CreateBitCast(B, I32Ty);
  Value *Acc = ConstantInt::get(I32Ty, 0);

  Intrinsic::ID DotID =
      IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
  Value *Dot = Builder.CreateIntrinsic(
      DotID, {}, {CastA, CastB, Acc, Builder.getFalse()});
  Dot->takeName(&I);
  I.replaceAllUsesWith(Dot);
  DeadVals.push_back(&I);
  return true;
}
bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &I) {
  if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
    return false;

  Intrinsic::ID IID = I.getIntrinsicID();
  bool IsSigned = (IID == Intrinsic::sadd_sat);

  // Look for a saturating add where one operand is a vector_reduce_add of a
  // dot4 pattern; that maps onto the dot4 intrinsic's accumulator operand
  // with clamping.
  Value *Op0 = I.getArgOperand(0);
  Value *Op1 = I.getArgOperand(1);
  Value *MulOp = nullptr;
  Value *Accum = nullptr;
  IntrinsicInst *ReduceInst = nullptr;
  using namespace PatternMatch;
  if (match(Op0, m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(MulOp)))) {
    ReduceInst = cast<IntrinsicInst>(Op0);
    Accum = Op1;
  } else if (match(Op1,
                   m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(MulOp)))) {
    ReduceInst = cast<IntrinsicInst>(Op1);
    Accum = Op0;
  } else
    return false;

  Value *A = nullptr, *B = nullptr;
  if (!matchDot4Pattern(MulOp, A, B, IsSigned))
    return false;

  LLVMContext &Ctx = I.getContext();
  Type *I32Ty = Type::getInt32Ty(Ctx);
  IRBuilder<> Builder(&I);
  // ... (bitcast A/B to i32 and emit the dot4 with Accum and clamp set) ...
  Intrinsic::ID DotID =
      IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
  // ...
  I.replaceAllUsesWith(Dot);
  DeadVals.push_back(&I);
  // The now-dead reduction is also queued for deletion.
  DeadVals.push_back(ReduceInst);
  return true;
}
char AMDGPUCodeGenPrepare::ID = 0;

FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
  return new AMDGPUCodeGenPrepare();
}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B, bool IsSigned)
Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x i8>) Returns true if pattern...
static bool isV4I8(Type *Ty)
Check if type is <4 x i8>.
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
opStatus next(bool nextDown)
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
BinaryOps getOpcode() const
BitVector & set()
Set all bits in the bitvector.
bool all() const
Returns true if all bits are set.
Represents analyses that only rely on functions' control flow.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Utility class for floating point operations which can have information about relaxed accuracy require...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
void setFast(bool B=true)
bool allowReciprocal() const
void setNoNaNs(bool B=true)
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
CallInst * CreateFAbs(Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fabs intrinsic.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Base class for instruction visitors.
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the origina...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
auto reverse(ContainerTy &&C)
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
bool isNegative() const
Returns true if this value is known to be negative.
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
SimplifyQuery getWithInstruction(const Instruction *I) const