30#include "llvm/IR/IntrinsicsAMDGPU.h"
41#define DEBUG_TYPE "amdgpu-codegenprepare"
49 "amdgpu-codegenprepare-widen-constant-loads",
50 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
55 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
56 cl::desc(
"Break large PHI nodes for DAGISel"),
60 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc(
"For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
71 "amdgpu-codegenprepare-mul24",
72 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
98class AMDGPUCodeGenPrepareImpl
99 :
public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
108 const bool HasFP32DenormalFlush;
109 bool FlowChanged =
false;
110 mutable Function *SqrtF32 =
nullptr;
111 mutable Function *LdexpF32 =
nullptr;
120 DL(
F.getDataLayout()), SQ(
DL, TLI, DT, AC),
130 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
140 F.getParent(), Intrinsic::ldexp,
141 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
145 bool canBreakPHINode(
const PHINode &
I);
148 bool isLegalFloatingTy(
const Type *
T)
const;
157 bool canIgnoreDenormalInput(
const Value *V,
const Instruction *CtxI)
const {
158 return HasFP32DenormalFlush ||
183 unsigned MaxDivBits,
bool Signed)
const;
188 bool IsDiv,
bool IsSigned)
const;
192 bool IsDiv,
bool IsSigned)
const;
210 bool canWidenScalarExtLoad(
LoadInst &
I)
const;
225 float ReqdAccuracy)
const;
230 float ReqdAccuracy)
const;
232 std::pair<Value *, Value *> getFrexpResults(
IRBuilder<> &Builder,
236 bool IsNegative)
const;
243 bool IsNegative)
const;
247 void replaceWithMaskedWorkitemIdX(
Instruction &
I,
unsigned WaveSize)
const;
248 bool tryReplaceWithWorkitemId(
Instruction &
I,
unsigned Wave)
const;
283 if (!ExpandDiv64InIR)
287 StringRef getPassName()
const override {
return "AMDGPU IR optimizations"; }
292bool AMDGPUCodeGenPrepareImpl::run() {
293 BreakPhiNodesCache.clear();
294 bool MadeChange =
false;
306 while (!DeadVals.empty()) {
314bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(
const Type *Ty)
const {
316 (Ty->
isHalfTy() && ST.has16BitInsts());
319bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &
I)
const {
320 Type *Ty =
I.getType();
321 int TySize =
DL.getTypeSizeInBits(Ty);
322 Align Alignment =
DL.getValueOrABITypeAlignment(
I.getAlign(), Ty);
324 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.
isUniformAtDef(&
I);
328AMDGPUCodeGenPrepareImpl::numBitsUnsigned(
Value *
Op,
329 const Instruction *CtxI)
const {
334AMDGPUCodeGenPrepareImpl::numBitsSigned(
Value *
Op,
335 const Instruction *CtxI)
const {
347 for (
int I = 0,
E = VT->getNumElements();
I !=
E; ++
I)
348 Values.
push_back(Builder.CreateExtractElement(V,
I));
354 if (!Ty->isVectorTy()) {
360 for (
int I = 0,
E = Values.
size();
I !=
E; ++
I)
361 NewVal = Builder.CreateInsertElement(NewVal, Values[
I],
I);
366bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &
I)
const {
367 if (
I.getOpcode() != Instruction::Mul)
370 Type *Ty =
I.getType();
372 if (
Size <= 16 && ST.has16BitInsts())
382 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
384 unsigned LHSBits = 0, RHSBits = 0;
385 bool IsSigned =
false;
387 if (ST.
hasMulU24() && (LHSBits = numBitsUnsigned(
LHS, &
I)) <= 24 &&
388 (RHSBits = numBitsUnsigned(
RHS, &
I)) <= 24) {
391 }
else if (ST.
hasMulI24() && (LHSBits = numBitsSigned(
LHS, &
I)) <= 24 &&
392 (RHSBits = numBitsSigned(
RHS, &
I)) <= 24) {
398 SmallVector<Value *, 4> LHSVals;
399 SmallVector<Value *, 4> RHSVals;
400 SmallVector<Value *, 4> ResultVals;
404 IntegerType *I32Ty = Builder.getInt32Ty();
405 IntegerType *IntrinTy =
Size > 32 ? Builder.getInt64Ty() : I32Ty;
406 Type *DstTy = LHSVals[0]->getType();
408 for (
int I = 0,
E = LHSVals.
size();
I !=
E; ++
I) {
409 Value *
LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[
I], I32Ty)
410 : Builder.CreateZExtOrTrunc(LHSVals[
I], I32Ty);
411 Value *
RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[
I], I32Ty)
412 : Builder.CreateZExtOrTrunc(RHSVals[
I], I32Ty);
414 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
416 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
417 : Builder.CreateZExtOrTrunc(Result, DstTy);
423 I.replaceAllUsesWith(NewVal);
424 DeadVals.push_back(&
I);
444bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO)
const {
465 if (!CBO || !CT || !CF)
492 Builder.setFastMathFlags(FPOp->getFastMathFlags());
498 DeadVals.push_back(&BO);
500 DeadVals.push_back(CastOp);
501 DeadVals.push_back(Sel);
505std::pair<Value *, Value *>
506AMDGPUCodeGenPrepareImpl::getFrexpResults(
IRBuilder<> &Builder,
508 Type *Ty = Src->getType();
521 : Builder.CreateExtractValue(Frexp, {1});
522 return {FrexpMant, FrexpExp};
528 bool IsNegative)
const {
543 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
546 return Builder.
CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
552 FastMathFlags FMF)
const {
556 if (HasFP32DenormalFlush && ST.
hasFractBug() && !ST.hasFastFMAF32() &&
562 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder,
RHS);
567 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder,
LHS);
579 FastMathFlags FMF)
const {
580 Type *Ty = Src->getType();
584 Builder.
CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
587 Value *InputScaleFactor =
594 Value *OutputScaleFactor =
596 return Builder.
CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
607 Type *Ty = Src->getType();
611 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
612 Constant *One = ConstantFP::get(Ty, 1.0);
613 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
615 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
617 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
619 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
620 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
621 Value *OutputScaleFactor = Builder.CreateSelect(
622 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
624 return Builder.CreateFMul(Rsq, OutputScaleFactor);
630 FastMathFlags SqrtFMF,
631 FastMathFlags DivFMF,
632 const Instruction *CtxI,
633 bool IsNegative)
const {
655 bool MaybePosInf = !SqrtFMF.
noInfs() && !DivFMF.
noInfs();
656 bool MaybeZero = !DivFMF.
noInfs();
658 DenormalMode DenormMode;
665 if (Interested !=
fcNone) {
670 DenormMode =
F.getDenormalMode(
X->getType()->getFltSemantics());
676 if (MaybeZero || MaybePosInf) {
678 if (MaybePosInf && MaybeZero) {
679 if (DenormMode.
Input != DenormalMode::DenormalModeKind::Dynamic) {
694 }
else if (MaybeZero) {
707 Value *
E = Builder.
CreateFMA(NegXY0, Y0, ConstantFP::get(
X->getType(), 1.0));
712 ConstantFP::get(
X->getType(), 0.5));
714 return Builder.
CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
717bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
718 FastMathFlags SqrtFMF)
const {
724Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
726 const FastMathFlags SqrtFMF,
const Instruction *CtxI)
const {
737 bool IsNegative =
false;
742 IRBuilder<>::FastMathFlagGuard Guard(Builder);
747 canIgnoreDenormalInput(Den, CtxI)) {
758 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
772 Value *Den, FastMathFlags FMF,
773 const Instruction *CtxI)
const {
780 bool IsNegative =
false;
785 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
806 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
815 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
820 Value *Recip = emitRcpIEEE1ULP(Builder, Den,
false);
834Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
837 if (ReqdAccuracy < 2.5f)
843 bool NumIsOne =
false;
845 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
853 if (!HasFP32DenormalFlush && !NumIsOne)
856 return Builder.
CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
859Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
861 FastMathFlags SqrtFMF,
Value *RsqOp,
const Instruction *FDivInst,
862 float ReqdDivAccuracy)
const {
865 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
873 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
881 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
885 return emitFrexpDiv(Builder, Num, Den, DivFMF);
903bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
904 if (DisableFDivExpand)
919 FastMathFlags SqrtFMF;
924 Value *RsqOp =
nullptr;
926 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
927 DenII->hasOneUse()) {
929 SqrtFMF = SqrtOp->getFastMathFlags();
930 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
931 RsqOp = SqrtOp->getOperand(0);
935 if (!IsFloat && !RsqOp)
947 const bool AllowInaccurateRcp = DivFMF.
approxFunc();
948 if (!RsqOp && AllowInaccurateRcp)
952 if (IsFloat && ReqdAccuracy < 1.0f)
959 SmallVector<Value *, 4> NumVals;
960 SmallVector<Value *, 4> DenVals;
961 SmallVector<Value *, 4> RsqDenVals;
968 SmallVector<Value *, 4> ResultVals(NumVals.
size());
969 for (
int I = 0,
E = NumVals.
size();
I !=
E; ++
I) {
970 Value *NumElt = NumVals[
I];
971 Value *DenElt = DenVals[
I];
972 Value *RsqDenElt = RsqOp ? RsqDenVals[
I] :
nullptr;
975 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
984 NewEltInst->copyMetadata(FDiv);
987 ResultVals[
I] = NewElt;
995 DeadVals.push_back(&FDiv);
1006 Value *LHS_EXT64 = Builder.CreateZExt(
LHS, I64Ty);
1007 Value *RHS_EXT64 = Builder.CreateZExt(
RHS, I64Ty);
1008 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1009 Value *
Lo = Builder.CreateTrunc(MUL64, I32Ty);
1010 Value *
Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1011 Hi = Builder.CreateTrunc(
Hi, I32Ty);
1012 return std::pair(
Lo,
Hi);
1023unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &
I,
Value *Num,
1025 unsigned MaxDivBits,
1026 bool IsSigned)
const {
1033 unsigned DivBits = SSBits - RHSSignBits + 1;
1034 if (DivBits > MaxDivBits)
1039 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1040 DivBits = SSBits - SignBits + 1;
1048 if (RHSBits > MaxDivBits)
1054 unsigned DivBits = std::max(LHSBits, RHSBits);
1061 BinaryOperator &
I,
Value *Num,
1062 Value *Den,
bool IsDiv,
1063 bool IsSigned)
const {
1064 unsigned DivBits = getDivNumBits(
I, Num, Den, 24, IsSigned);
1076 if (DivBits > (IsSigned ? 24 : 23))
1078 return expandDivRem24Impl(Builder,
I, Num, Den, DivBits, IsDiv, IsSigned);
1081Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1083 unsigned DivBits,
bool IsDiv,
bool IsSigned)
const {
1089 ConstantInt *One = Builder.
getInt32(1);
1131 auto FMAD = !ST.hasMadMacF32Insts()
1163 if (DivBits != 0 && DivBits < 32) {
1166 int InRegBits = 32 - DivBits;
1168 Res = Builder.
CreateShl(Res, InRegBits);
1171 ConstantInt *TruncMask
1172 = Builder.
getInt32((UINT64_C(1) << DivBits) - 1);
1173 Res = Builder.
CreateAnd(Res, TruncMask);
1184bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &
I,
1190 if (
C->getType()->getScalarSizeInBits() <= 32)
1206 if (BinOpDen->getOpcode() == Instruction::Shl &&
1224 return Builder.CreateAShr(V, Builder.getInt32(31));
1231 assert(
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1232 Opc == Instruction::SRem ||
Opc == Instruction::SDiv);
1238 if (divHasSpecialOptimization(
I,
X,
Y))
1241 bool IsDiv =
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv;
1242 bool IsSigned =
Opc == Instruction::SRem ||
Opc == Instruction::SDiv;
1244 Type *Ty =
X->getType();
1258 if (
Value *Res = expandDivRem24(Builder,
I,
X,
Y, IsDiv, IsSigned)) {
1264 ConstantInt *One = Builder.
getInt32(1);
1266 Value *Sign =
nullptr;
1271 Sign = IsDiv ? Builder.
CreateXor(SignX, SignY) : SignX;
1352 BinaryOperator &
I,
Value *Num,
1354 if (!ExpandDiv64InIR && divHasSpecialOptimization(
I, Num, Den))
1359 bool IsDiv =
Opc == Instruction::SDiv ||
Opc == Instruction::UDiv;
1360 bool IsSigned =
Opc == Instruction::SDiv ||
Opc == Instruction::SRem;
1362 unsigned NumDivBits = getDivNumBits(
I, Num, Den, 32, IsSigned);
1363 if (NumDivBits > 32)
1366 Value *Narrowed =
nullptr;
1377 if (NumDivBits <= (IsSigned ? 24 : 23)) {
1378 Narrowed = expandDivRem24Impl(Builder,
I, Num, Den, NumDivBits,
1380 }
else if (NumDivBits <= 32) {
1381 Narrowed = expandDivRem32(Builder,
I, Num, Den);
1392void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &
I)
const {
1395 if (
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv) {
1400 if (
Opc == Instruction::URem ||
Opc == Instruction::SRem) {
1420bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *
I) {
1421 unsigned Opc =
I->getOpcode();
1422 Type *OldType =
I->getType();
1424 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1429 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1431 "Instruction::Mul.");
1435 MaxBitsNeeded = std::max<unsigned>(
bit_ceil(MaxBitsNeeded), 8);
1436 Type *NewType =
DL.getSmallestLegalIntType(
I->getContext(), MaxBitsNeeded);
1440 if (NewBit >= OrigBit)
1452 int NumOfNonConstOps = 2;
1455 NumOfNonConstOps = 1;
1465 if (NewCost >= OldCost)
1476 DeadVals.push_back(
I);
1480bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &
I) {
1481 if (foldBinOpIntoSelect(
I))
1484 if (UseMul24Intrin && replaceMulWithMul24(
I))
1486 if (tryNarrowMathIfNoOverflow(&
I))
1491 Type *Ty =
I.getType();
1492 Value *NewDiv =
nullptr;
1497 if ((
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1498 Opc == Instruction::SRem ||
Opc == Instruction::SDiv) &&
1500 !DisableIDivExpand) {
1501 Value *Num =
I.getOperand(0);
1502 Value *Den =
I.getOperand(1);
1509 for (
unsigned N = 0,
E = VT->getNumElements();
N !=
E; ++
N) {
1514 if (ScalarSize <= 32) {
1515 NewElt = expandDivRem32(Builder,
I, NumEltN, DenEltN);
1521 NewElt = shrinkDivRem64(Builder,
I, NumEltN, DenEltN);
1535 NewEltI->copyIRFlags(&
I);
1540 if (ScalarSize <= 32)
1541 NewDiv = expandDivRem32(Builder,
I, Num, Den);
1543 NewDiv = shrinkDivRem64(Builder,
I, Num, Den);
1550 I.replaceAllUsesWith(NewDiv);
1551 DeadVals.push_back(&
I);
1556 if (ExpandDiv64InIR) {
1558 for (BinaryOperator *Div : Div64ToExpand) {
1559 expandDivRem64(*Div);
1568bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &
I) {
1574 canWidenScalarExtLoad(
I)) {
1585 if (
auto *
Range =
I.getMetadata(LLVMContext::MD_range)) {
1588 if (!
Lower->isNullValue()) {
1595 WidenLoad->setMetadata(LLVMContext::MD_range,
1600 int TySize =
DL.getTypeSizeInBits(
I.getType());
1605 DeadVals.push_back(&
I);
1612bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &
I) {
1618 Value *Fract =
nullptr;
1627 Value *FractSrc = matchFractPatImpl(*
X, *
C);
1632 Fract = applyFractPat(Builder, FractSrc);
1642 CmpPredicate IsNanPred;
1651 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1652 CmpVal == matchFractPatNanAvoidant(*FalseVal)) {
1654 Fract = applyFractPat(Builder, CmpVal);
1655 }
else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1656 if (CmpVal == matchFractPatNanAvoidant(*TrueVal)) {
1658 Fract = applyFractPat(Builder, CmpVal);
1662 CmpPredicate PredInf;
1668 PredInf != FCmpInst::FCMP_UNE ||
1669 CmpVal != matchFractPatNanAvoidant(*IfNotInf))
1679 Value *NewFract = applyFractPat(Builder, CmpVal);
1683 DeadVals.push_back(ClampInfSelect->
getOperand(1));
1687 Fract = ClampInfSelect;
1694 I.replaceAllUsesWith(Fract);
1695 DeadVals.push_back(&
I);
1702 return IA && IB && IA->getParent() == IB->getParent();
1712 const Value *CurVal = V;
1715 BitVector EltsCovered(FVT->getNumElements());
1722 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1725 const auto *VecSrc = IE->getOperand(0);
1734 EltsCovered.
set(Idx->getZExtValue());
1737 if (EltsCovered.
all())
1764 const auto [It, Inserted] = SeenPHIs.
insert(&
I);
1768 for (
const Value *Inc :
I.incoming_values()) {
1773 for (
const User *U :
I.users()) {
1779bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(
const PHINode &
I) {
1781 if (
const auto It = BreakPhiNodesCache.find(&
I);
1782 It != BreakPhiNodesCache.end())
1791 SmallPtrSet<const PHINode *, 8> WorkList;
1797 for (
const PHINode *WLP : WorkList) {
1798 assert(BreakPhiNodesCache.count(WLP) == 0);
1813 const auto Threshold = (
alignTo(WorkList.size() * 2, 3) / 3);
1814 unsigned NumBreakablePHIs = 0;
1815 bool CanBreak =
false;
1816 for (
const PHINode *Cur : WorkList) {
1824 if (++NumBreakablePHIs >= Threshold) {
1831 for (
const PHINode *Cur : WorkList)
1832 BreakPhiNodesCache[Cur] = CanBreak;
1881 Value *&Res = SlicedVals[{BB, Inc}];
1887 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1893 Res =
B.CreateShuffleVector(Inc, Mask, NewValName);
1895 Res =
B.CreateExtractElement(Inc,
Idx, NewValName);
1904bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &
I) {
1920 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1923 if (!ForceBreakLargePHIs && !canBreakPHINode(
I))
1926 std::vector<VectorSlice> Slices;
1933 const unsigned EltSize =
DL.getTypeSizeInBits(EltTy);
1935 if (EltSize == 8 || EltSize == 16) {
1936 const unsigned SubVecSize = (32 / EltSize);
1938 for (
unsigned End =
alignDown(NumElts, SubVecSize); Idx < End;
1940 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1944 for (; Idx < NumElts; ++Idx)
1945 Slices.emplace_back(EltTy, Idx, 1);
1948 assert(Slices.size() > 1);
1954 B.SetCurrentDebugLocation(
I.getDebugLoc());
1956 unsigned IncNameSuffix = 0;
1957 for (VectorSlice &S : Slices) {
1960 B.SetInsertPoint(
I.getParent()->getFirstNonPHIIt());
1961 S.NewPHI =
B.CreatePHI(S.Ty,
I.getNumIncomingValues());
1963 for (
const auto &[Idx, BB] :
enumerate(
I.blocks())) {
1964 S.NewPHI->addIncoming(S.getSlicedVal(BB,
I.getIncomingValue(Idx),
1965 "largephi.extractslice" +
1966 std::to_string(IncNameSuffix++)),
1973 unsigned NameSuffix = 0;
1974 for (VectorSlice &S : Slices) {
1975 const auto ValName =
"largephi.insertslice" + std::to_string(NameSuffix++);
1977 Vec =
B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1979 Vec =
B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1982 I.replaceAllUsesWith(Vec);
1983 DeadVals.push_back(&
I);
2006 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
2025 assert(SrcPtrKB.getBitWidth() ==
DL.getPointerSizeInBits(AS));
2026 assert((NullVal == 0 || NullVal == -1) &&
2027 "don't know how to check for this null value!");
2028 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
2031bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &
I) {
2035 if (
I.getType()->isVectorTy())
2040 const unsigned SrcAS =
I.getSrcAddressSpace();
2041 const unsigned DstAS =
I.getDestAddressSpace();
2043 bool CanLower =
false;
2061 auto *Intrin =
B.CreateIntrinsic(
2062 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2063 I.replaceAllUsesWith(Intrin);
2064 DeadVals.push_back(&
I);
2068bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &
I) {
2071 case Intrinsic::minnum:
2072 case Intrinsic::minimumnum:
2073 case Intrinsic::minimum:
2074 return visitFMinLike(
I);
2075 case Intrinsic::sqrt:
2076 return visitSqrt(
I);
2077 case Intrinsic::log:
2078 case Intrinsic::log10:
2080 case Intrinsic::log2:
2083 case Intrinsic::amdgcn_mbcnt_lo:
2084 return visitMbcntLo(
I);
2085 case Intrinsic::amdgcn_mbcnt_hi:
2086 return visitMbcntHi(
I);
2087 case Intrinsic::vector_reduce_add:
2088 return visitVectorReduceAdd(
I);
2089 case Intrinsic::uadd_sat:
2090 case Intrinsic::sadd_sat:
2091 return visitSaturatingAdd(
I);
2099Value *AMDGPUCodeGenPrepareImpl::matchFractPatImpl(
Value &FractSrc,
2100 const APFloat &
C)
const {
2109 OneNextDown.
next(
true);
2112 if (OneNextDown !=
C)
2132Value *AMDGPUCodeGenPrepareImpl::matchFractPatNanAvoidant(
Value &V) {
2144 return matchFractPatImpl(*Arg0, *
C);
2149 SmallVector<Value *, 4> FractVals;
2152 SmallVector<Value *, 4> ResultVals(FractVals.
size());
2155 for (
unsigned I = 0,
E = FractVals.
size();
I !=
E; ++
I) {
2163bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &
I) {
2171 FractArg = matchFractPatImpl(*
X, *
C);
2176 FractArg = matchFractPatNanAvoidant(
I);
2188 FastMathFlags FMF =
I.getFastMathFlags();
2192 Value *Fract = applyFractPat(Builder, FractArg);
2194 I.replaceAllUsesWith(Fract);
2195 DeadVals.push_back(&
I);
2200bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2216 if (ReqdAccuracy < 1.0f)
2220 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2224 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2228 SmallVector<Value *, 4> SrcVals;
2231 SmallVector<Value *, 4> ResultVals(SrcVals.
size());
2232 for (
int I = 0,
E = SrcVals.
size();
I !=
E; ++
I) {
2234 ResultVals[
I] = Builder.
CreateCall(getSqrtF32(), SrcVals[
I]);
2236 ResultVals[
I] = emitSqrtIEEE2ULP(Builder, SrcVals[
I], SqrtFMF);
2242 DeadVals.push_back(&Sqrt);
2247bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2253 FastMathFlags FMF =
Log.getFastMathFlags();
2260 if (
Log.getFPAccuracy() < 1.80f)
2271 double Log2BaseInverted =
2278 Log.replaceAllUsesWith(
Mul);
2279 DeadVals.push_back(&Log);
2283bool AMDGPUCodeGenPrepare::runOnFunction(Function &
F) {
2284 if (skipFunction(
F))
2287 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2291 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2292 const TargetLibraryInfo *TLI =
2293 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
F);
2294 AssumptionCache *AC =
2295 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
2296 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2297 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() :
nullptr;
2299 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2300 return AMDGPUCodeGenPrepareImpl(
F, TM, TLI, AC, DT, UA).run();
2310 AMDGPUCodeGenPrepareImpl Impl(
F, ATM, TLI, AC, DT, UA);
2314 if (!Impl.FlowChanged)
2320 "AMDGPU IR optimizations",
false,
false)
2329 CallInst *Tid =
B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2330 ST.makeLIDRangeMetadata(Tid);
2335void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &
I)
const {
2337 CallInst *Tid = createWorkitemIdX(
B);
2343void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2344 Instruction &
I,
unsigned WaveSize)
const {
2346 CallInst *Tid = createWorkitemIdX(
B);
2348 Value *AndInst =
B.CreateAnd(Tid, Mask);
2356bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &
I,
2357 unsigned Wave)
const {
2364 if (*MaybeX == Wave) {
2365 replaceWithWorkitemIdX(
I);
2372 replaceWithMaskedWorkitemIdX(
I, Wave);
2380bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &
I)
const {
2396bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &
I)
const {
2409 if (*MaybeX == Wave) {
2420 using namespace PatternMatch;
2428 return tryReplaceWithWorkitemId(
I, Wave);
2454 Value *ExtSrc0, *ExtSrc1;
2474bool AMDGPUCodeGenPrepareImpl::visitVectorReduceAdd(IntrinsicInst &
I) {
2476 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2479 Value *
A =
nullptr, *
B =
nullptr;
2482 bool IsSigned =
false;
2489 LLVMContext &Ctx =
I.getContext();
2490 Type *I32Ty = Type::getInt32Ty(Ctx);
2498 Value *Acc = ConstantInt::get(I32Ty, 0);
2502 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2507 I.replaceAllUsesWith(Dot);
2508 DeadVals.push_back(&
I);
2516bool AMDGPUCodeGenPrepareImpl::visitSaturatingAdd(IntrinsicInst &
I) {
2518 if (!ST.hasDot7Insts() || (!ST.hasDot1Insts() && !ST.hasDot8Insts()))
2522 bool IsSigned = (IID == Intrinsic::sadd_sat);
2525 Value *Op0 =
I.getArgOperand(0);
2526 Value *Op1 =
I.getArgOperand(1);
2527 Value *MulOp =
nullptr;
2528 Value *Accum =
nullptr;
2529 IntrinsicInst *ReduceInst =
nullptr;
2534 }
else if (
match(Op1,
2542 Value *
A =
nullptr, *
B =
nullptr;
2547 LLVMContext &Ctx =
I.getContext();
2548 Type *I32Ty = Type::getInt32Ty(Ctx);
2559 IsSigned ? Intrinsic::amdgcn_sdot4 : Intrinsic::amdgcn_udot4;
2564 I.replaceAllUsesWith(Dot);
2565 DeadVals.push_back(&
I);
2568 DeadVals.push_back(ReduceInst);
2573char AMDGPUCodeGenPrepare::ID = 0;
2576 return new AMDGPUCodeGenPrepare();
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static bool matchDot4Pattern(Value *MulOp, Value *&A, Value *&B, bool IsSigned)
Helper to match the dot4 pattern: mul(zext/sext <4 x i8>, zext/sext <4 x i8>) Returns true if pattern...
static bool isV4I8(Type *Ty)
Check if type is <4 x i8>.
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
static Value * getOpcode(Value &V, Type &Ty, InstrumentationConfig &IConf, InstrumentorIRBuilderTy &IIRB)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
opStatus next(bool nextDown)
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
BinaryOps getOpcode() const
BitVector & set()
Set all bits in the bitvector.
bool all() const
Returns true if all bits are set.
Represents analyses that only rely on functions' control flow.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static LLVM_ABI ConstantFP * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
static LLVM_ABI ConstantFP * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Utility class for floating point operations which can have information about relaxed accuracy require...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
void setFast(bool B=true)
bool allowReciprocal() const
void setNoNaNs(bool B=true)
bool allowContract() const
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false, MDNode *FPMathTag=nullptr)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFAbs(Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fabs intrinsic.
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool' for the isVolatile parameter.
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Base class for instruction visitors.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
void copyMetadataForWidenedLoad(LoadInst &Dest, const LoadInst &Source)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
MaxMin_match< FCmpInst, LHS, RHS, ufmin_pred_ty > m_UnordFMin(const LHS &L, const RHS &R)
Match an 'unordered' floating point minimum function.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty, typename m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty > m_FMinNum_or_FMinimumNum(const Opnd0 &Op0, const Opnd1 &Op1)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match() expression.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
m_Intrinsic_Ty< Opnd0, Opnd1 >::Ty m_FMinimum(const Opnd0 &Op0, const Opnd1 &Op1)
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
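Given two or more input ranges, returns a new range whose values are tuples (A, B, C, ...), such that A is the 0-based index of the item in the sequence, and B, C, ..., are the values from the original input ranges.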
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iteration.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the origina...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
auto reverse(ContainerTy &&C)
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector<T, 0>).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type arguments.
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions and return multiple objects.
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known bits.
bool isNegative() const
Returns true if this value is known to be negative.
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
SimplifyQuery getWithInstruction(const Instruction *I) const