#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"

    "amdgpu-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),

    "amdgpu-codegenprepare-widen-16-bit-ops",
    cl::desc("Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),

    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),

    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),

    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),

    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),

    "amdgpu-codegenprepare-expand-div64",
    cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),

    "amdgpu-codegenprepare-disable-idiv-expansion",
    cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),

    "amdgpu-codegenprepare-disable-fdiv-expansion",
    cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {

  bool HasUnsafeFPMath = false;
  bool HasFP32DenormalFlush = false;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;
  bool canBreakPHINode(const PHINode &I);

  unsigned getBaseElementBitWidth(const Type *T) const;

  bool needsPromotionToI32(const Type *T) const;

  bool isLegalFloatingTy(const Type *T) const;

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||

  bool promoteUniformOpToI32(ICmpInst &I) const;

  unsigned numBitsUnsigned(Value *Op) const;

  unsigned numBitsSigned(Value *Op) const;

                       unsigned AtLeast, bool Signed) const;

                       bool IsDiv, bool IsSigned) const;

                       bool IsDiv, bool IsSigned) const;

  bool canWidenScalarExtLoad(LoadInst &I) const;

                        float ReqdAccuracy) const;

                        float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,

                         bool IsNegative) const;

  AMDGPUCodeGenPrepareImpl Impl;

    if (!ExpandDiv64InIR)
bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

    NextBB = std::next(FI);

      MadeChange |= visit(*I);

      if (NextInstBB != BB) {
unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();

  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0)) ?
         cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {

    if (ST->hasVOP3PInsts())

    return needsPromotionToI32(VT->getElementType());

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:

  case Instruction::Mul:
    return I.hasNoUnsignedWrap();

  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:

  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();

  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
      Inst->setHasNoSignedWrap();

      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
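// A sketch of the rewrite above for a uniform 16-bit op (illustrative IR, not
// taken from a test):
//   %r = add i16 %a, %b
// becomes
//   %a32 = zext i16 %a to i32      ; sext instead when isSigned(I)
//   %b32 = zext i16 %b to i32
//   %r32 = add i32 %a32, %b32      ; nsw/nuw/exact copied when still valid
//   %r   = trunc i32 %r32 to i16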
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);

    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());

  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
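// Reversing the zero-extended value moves the (32 - N) padding bits to the
// low end of the result, so the LShr above discards them before truncating;
// e.g. for i16: bitreverse.i16(x) == trunc(bitreverse.i32(zext x) >> 16).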
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {

  auto *VT = dyn_cast<FixedVectorType>(V->getType());

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)

  for (int I = 0, E = Values.size(); I != E; ++I)
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)

  Type *Ty = I.getType();

  if (Size <= 16 && ST->has16BitInsts())

  if (UA->isUniform(&I))

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {

  } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {

  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);

        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;

    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);

  I.replaceAllUsesWith(NewVal);
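// Illustrative effect of the loop above when both operands are known to fit
// in 24 bits (unsigned case; value names are made up):
//   %m = mul i32 %x, %y
// becomes
//   %m = call i32 @llvm.amdgcn.mul.u24(i32 %x, i32 %y)
// Vector multiplies are scalarized by extractValues() first and the scalar
// results reassembled by insertValues().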
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))

  if ((Cast = dyn_cast<CastInst>(V))) {

bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {

  if (!CBO || !CT || !CF)

  if (!FoldedT || isa<ConstantExpr>(FoldedT))

  if (!FoldedF || isa<ConstantExpr>(FoldedF))

  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
  Type *Ty = Src->getType();

  return {FrexpMant, FrexpExp};

                                          bool IsNegative) const {

  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);

  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
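// The identity behind this expansion: frexp splits x into m * 2^e with
// |m| in [0.5, 1), and 1/x == ldexp(rcp(m), -e). Taking rcp of the mantissa
// keeps the hardware reciprocal away from denormal or overflowing inputs,
// and the final ldexp folds the exponent back in, which is what keeps the
// result near 1 ulp even when FP32 denormals are flushed.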
  if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&

  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  Type *Ty = Src->getType();

      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  Value *InputScaleFactor =

  Value *OutputScaleFactor =

  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
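// Denormal-safe sqrt: inputs below the smallest normal are scaled up by an
// even power of two before the sqrt and the result is scaled back down via
// ldexp, using sqrt(x * 2^(2k)) == sqrt(x) * 2^k (the exact factors live in
// the elided lines; upstream scales by 2^32 in and 2^-16 out). NeedScale
// selects between the scaling and identity factors.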
  Type *Ty = Src->getType();

      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
  Constant *One = ConstantFP::get(Ty, 1.0);
  Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
      ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);

      NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
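// Same trick for rsq, with the constants visible above: denormal inputs are
// multiplied by 2^24, and since rsq(x * 2^24) == rsq(x) * 2^-12 the result is
// multiplied by 2^12 afterwards (or -2^12 / -1.0 when emitting -1/sqrt(x)).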
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,

  return SqrtFMF.approxFunc() || HasUnsafeFPMath ||

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(

  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);

  bool IsNegative = false;

      canIgnoreDenormalInput(Den, CtxI)) {

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;

    if (HasFP32DenormalFlush || FMF.approxFunc()) {

      return emitRcpIEEE1ULP(Builder, Src, IsNegative);

  if (HasFP32DenormalFlush || FMF.approxFunc()) {

    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
  if (ReqdAccuracy < 2.5f)

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))

  if (!HasFP32DenormalFlush && !NumIsOne)

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});

Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    float ReqdDivAccuracy) const {
      optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);

  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);

  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
  if (DisableFDivExpand)

  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);

    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))

  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)

  if (ReqdAccuracy < 1.0f)

  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);

    if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
      NewEltInst->copyMetadata(FDiv);

    ResultVals[I] = NewElt;

  Attribute Attr = F.getFnAttribute("unsafe-fp-math");

  return std::pair(Lo, Hi);
                                            Value *Den, unsigned AtLeast,
                                            bool IsSigned) const {

  if (LHSSignBits < AtLeast)

  if (RHSSignBits < AtLeast)

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);

                                            Value *Den, bool IsDiv,
                                            bool IsSigned) const {

  unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
  int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);

  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    unsigned DivBits, bool IsDiv, bool IsSigned) const {

  auto FMAD = !ST->hasMadMacF32Insts()

                                 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  if (DivBits != 0 && DivBits < 32) {

    int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);

          = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
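// expandDivRem24Impl is the float-based small-divide expansion: 24-bit
// operands convert exactly to f32, the quotient estimate is fa * rcp(fb), and
// the fma/fmad above (FQNeg * FB + FA) recomputes the remainder so the
// estimate can be corrected to the exact result. When DivBits < 32, the
// narrow result is re-extended in register: shifted left by (32 - DivBits)
// and arithmetic-shifted back for signed values, or masked with
// (1 << DivBits) - 1 for unsigned values.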
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,

  if (Constant *C = dyn_cast<Constant>(Den)) {

    if (C->getType()->getScalarSizeInBits() <= 32)

    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  if (divHasSpecialOptimization(I, X, Y))

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {

  Value *Sign = nullptr;

    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
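// 0x4F7FFFFE is a float a couple of ulps below 2^32; scaling the f32
// reciprocal of the divisor by it yields a fixed-point estimate of 2^32/den
// that errs low rather than overflowing, which the (elided) quotient and
// remainder fix-up steps rely on. Signed cases are reduced to unsigned ones,
// with Sign (the xor of the operand sign masks for division, the numerator
// sign for remainder) applied to the result afterwards.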
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);

void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {

  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA->isUniform(&I) && promoteUniformOpToI32(I))

  if (UseMul24Intrin && replaceMulWithMul24(I))

  bool Changed = false;

  Type *Ty = I.getType();
  Value *NewDiv = nullptr;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {

        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);

          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);

        NewDiv = shrinkDivRem64(Builder, I, Num, Den);

      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();

  if (ExpandDiv64InIR) {

      expandDivRem64(*Div);
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {

      canWidenScalarExtLoad(I)) {

        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);

    int TySize = Mod->getDataLayout().getTypeSizeInBits(I.getType());

    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
    Changed |= promoteUniformOpToI32(I);
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
    if (UA->isUniform(&I))
      return promoteUniformOpToI32(I);

  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {

    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {

    Fract = applyFractPat(Builder, CmpVal);

  I.replaceAllUsesWith(Fract);
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();

  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());

  const Value *CurVal = V;

  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())

    const auto *VecSrc = IE->getOperand(0);

    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))

    EltsCovered.set(Idx->getZExtValue());

    if (EltsCovered.all())

  if (isa<Constant>(CurVal))

  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||
  const auto [It, Inserted] = SeenPHIs.insert(&I);

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {

  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())

  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);

  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {

    if (++NumBreakablePHIs >= Threshold) {

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;
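// alignTo(WorkList.size() * 2, 3) / 3 is ceil(2N/3): breaking the group of
// connected PHIs is only considered worthwhile when at least two thirds of
// them have cheaply sliceable incoming values, and the verdict is then cached
// for every node in the group.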
  Value *&Res = SlicedVals[{BB, Inc}];

  if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
    B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    Res = B.CreateShuffleVector(Inc, Mask, NewValName);

    Res = B.CreateExtractElement(Inc, Idx, NewValName);
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {

      DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))

  std::vector<VectorSlice> Slices;

    const unsigned EltSize = DL->getTypeSizeInBits(EltTy);

    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);

        Slices.emplace_back(SubVecTy, Idx, SubVecSize);

    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);

  assert(Slices.size() > 1);

  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;

    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),

  unsigned NameSuffix = 0;

    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);

      B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);

      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
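// Illustrative outcome (value names are made up): a phi over <8 x half>,
// which is above the size threshold, is rebuilt as four <2 x half> phis;
// 8- and 16-bit elements are grouped into 32-bit subvectors while wider
// elements get one phi each, and the original vector is reassembled right
// after the phis with "largephi.insertslice*" insertelement/insertvector
// chains.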
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))

  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())

  if (AS != cast<PointerType>(V->getType())->getAddressSpace())

  const auto NullVal = TM.getNullPointerValue(AS);

  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
  if (I.getType()->isVectorTy())

  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;

  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);
bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
    Changed |= promoteUniformBitreverseToI32(I);
  if (ST->hasFractBug())

  if (I.getIntrinsicID() != Intrinsic::minnum)

  Type *Ty = I.getType();

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

      m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {

  Value *FractArg = matchFractPat(I);

  if (!I.hasNoNaNs() &&

  Value *Fract = applyFractPat(Builder, FractArg);

  I.replaceAllUsesWith(Fract);
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {

  if (ReqdAccuracy < 1.0f)

  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&

  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)

  for (int I = 0, E = SrcVals.size(); I != E; ++I) {

      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);

      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
  Impl.DL = &Impl.Mod->getDataLayout();
  Impl.SqrtF32 = nullptr;
  Impl.LdexpF32 = nullptr;

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();

  Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
  Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;

  Impl.HasFP32DenormalFlush =

  AMDGPUCodeGenPrepareImpl Impl;
  Impl.Mod = F.getParent();
  Impl.DL = &Impl.Mod->getDataLayout();

  Impl.HasFP32DenormalFlush =

  if (!Impl.FlowChanged)

                    "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

  return new AMDGPUCodeGenPrepare();