#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc(
"Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
55 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
56 cl::desc(
"Break large PHI nodes for DAGISel"),
60 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc(
"For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
71 "amdgpu-codegenprepare-mul24",
72 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
static bool hasUnsafeFPMath(const Function &F) {
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
  const bool HasUnsafeFPMath;
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;
      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
        DT(DT), UA(UA), DL(F.getDataLayout()),
        HasUnsafeFPMath(hasUnsafeFPMath(F)),

        F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});

        F.getParent(), Intrinsic::ldexp,
        {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
  bool canBreakPHINode(const PHINode &I);

  unsigned getBaseElementBitWidth(const Type *T) const;

  bool needsPromotionToI32(const Type *T) const;

  bool isLegalFloatingTy(const Type *T) const;

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||

  bool promoteUniformOpToI32(ICmpInst &I) const;

  unsigned numBitsUnsigned(Value *Op) const;
  unsigned numBitsSigned(Value *Op) const;

                    unsigned AtLeast, bool Signed) const;
                    bool IsDiv, bool IsSigned) const;
                    bool IsDiv, bool IsSigned) const;

  bool canWidenScalarExtLoad(LoadInst &I) const;

                        float ReqdAccuracy) const;
                        float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                     bool IsNegative) const;
    if (!ExpandDiv64InIR)

bool AMDGPUCodeGenPrepareImpl::run() {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

      NextBB = std::next(FI);

      if (NextInstBB != BB) {
unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();

  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();
bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0))
             ? cast<ICmpInst>(I.getOperand(0))->isSigned()
             : false;
}
bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
    if (ST.hasVOP3PInsts())

    return needsPromotionToI32(VT->getElementType());

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:

  case Instruction::Mul:
    return I.hasNoUnsignedWrap();

  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:

  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
      Inst->setHasNoSignedWrap();

      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
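// An illustrative standalone sketch (not part of this file) of the promotion
// above: a uniform 16-bit operation is widened by sign- or zero-extending the
// operands to 32 bits, performing the operation there, and truncating the
// result back. The helper name and the use of plain C++ integers are
// assumptions for illustration only.
#include <cstdint>

inline int16_t addPromotedToI32(int16_t A, int16_t B, bool IsSigned) {
  int32_t Ext0 = IsSigned ? int32_t(A) : int32_t(uint16_t(A)); // sext vs. zext
  int32_t Ext1 = IsSigned ? int32_t(B) : int32_t(uint16_t(B));
  int32_t ExtRes = Ext0 + Ext1;   // the widened binary operation
  return int16_t(ExtRes);         // truncate back to the original width
}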
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);

    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
      Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
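// An illustrative standalone sketch (not part of this file) of the shift
// compensation above: reversing a narrow value through a 32-bit bitreverse
// leaves the result in the high bits, so it is shifted right by
// 32 - getBaseElementBitWidth(...) before truncating. The helpers below are
// hypothetical reference code, not the pass's IR.
#include <cstdint>

inline uint32_t reverseBits32(uint32_t V) {        // simple reference reversal
  uint32_t R = 0;
  for (int BitIdx = 0; BitIdx < 32; ++BitIdx)
    R |= ((V >> BitIdx) & 1u) << (31 - BitIdx);
  return R;
}

inline uint16_t reverseBits16ViaI32(uint16_t V) {
  uint32_t Ext = V;                                // zext to 32 bits
  uint32_t Rev = reverseBits32(Ext);               // 32-bit bitreverse
  return uint16_t(Rev >> (32 - 16));               // compensate, then truncate
}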
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {

  auto *VT = dyn_cast<FixedVectorType>(V->getType());

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)

  for (int I = 0, E = Values.size(); I != E; ++I)
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)

  Type *Ty = I.getType();

  if (Size <= 16 && ST.has16BitInsts())

  if (UA.isUniform(&I))

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {

  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);

        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;

    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);

  I.replaceAllUsesWith(NewVal);
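// An illustrative standalone sketch (not part of this file) of the operand
// test above: the multiply can use the 24-bit multiply intrinsics when both
// operands are known to need at most 24 bits, because the low 32 bits of a
// 24 x 24 bit product then match the original 32-bit multiply. Constant
// operands stand in here for the known-bits analysis the pass actually uses.
#include <bit>
#include <cstdint>

inline unsigned numBitsUnsignedConst(uint32_t V) {
  return 32u - unsigned(std::countl_zero(V));      // bits actually needed
}

inline bool canUseMulU24(uint32_t LHS, uint32_t RHS) {
  return numBitsUnsignedConst(LHS) <= 24 && numBitsUnsignedConst(RHS) <= 24;
}
// canUseMulU24(100000u, 3000u) is true: the operands need 17 and 12 bits.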
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))

  if ((Cast = dyn_cast<CastInst>(V))) {

bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  if (!CBO || !CT || !CF)

  if (!FoldedT || isa<ConstantExpr>(FoldedT))

  if (!FoldedF || isa<ConstantExpr>(FoldedF))

  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
  Type *Ty = Src->getType();

  return {FrexpMant, FrexpExp};

                                          bool IsNegative) const {
  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);

  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
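// An illustrative standalone sketch (not part of this file, using <cmath>) of
// the scaling identity behind the frexp/ldexp reciprocal above: with
// x == m * 2^e and |m| in [0.5, 1), 1/x == (1/m) * 2^-e, so the actual divide
// only ever sees a well-scaled mantissa and cannot hit denormal inputs or
// overflow on tiny ones.
#include <cmath>

inline float rcpViaFrexp(float X) {
  int Exp = 0;
  float Mant = std::frexp(X, &Exp);  // X == Mant * 2^Exp, |Mant| in [0.5, 1)
  float Rcp = 1.0f / Mant;           // safe: |Mant| is neither tiny nor huge
  return std::ldexp(Rcp, -Exp);      // rescale: 1/X == (1/Mant) * 2^-Exp
}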
  if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&

  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  Type *Ty = Src->getType();
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  Value *InputScaleFactor =

  Value *OutputScaleFactor =

  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});

  Type *Ty = Src->getType();
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
  Constant *One = ConstantFP::get(Ty, 1.0);
  Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
      ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
      NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
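// An illustrative standalone sketch (not part of this file, using <cmath>) of
// the denormal rescue visible above: an input below the smallest normal float
// is scaled up by 0x1.0p+24 before the rsq, and the result is rescaled by
// 0x1.0p+12 afterwards, since rsq(x * 2^24) == rsq(x) * 2^-12. The sqrt path
// above applies the same idea with different factors (not shown here).
#include <cfloat>
#include <cmath>

inline float rsqWithDenormScaling(float X) {
  bool NeedScale = X < FLT_MIN;                 // below the smallest normal
  float In = NeedScale ? X * 0x1.0p+24f : X;    // scale the input up by 2^24
  float Rsq = 1.0f / std::sqrt(In);
  return NeedScale ? Rsq * 0x1.0p+12f : Rsq;    // undo the input scaling
}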
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
  return SqrtFMF.approxFunc() || HasUnsafeFPMath ||

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);

  bool IsNegative = false;

      canIgnoreDenormalInput(Den, CtxI)) {
  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {

      return emitRcpIEEE1ULP(Builder, Src, IsNegative);

  if (HasFP32DenormalFlush || FMF.approxFunc()) {

    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
  if (ReqdAccuracy < 2.5f)

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))

  if (!HasFP32DenormalFlush && !NumIsOne)

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});

Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    float ReqdDivAccuracy) const {
        optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);

  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);

  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
  if (DisableFDivExpand)

  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);

    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))

  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)

  if (ReqdAccuracy < 1.0f)

  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);

      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);

    ResultVals[I] = NewElt;
  return std::pair(Lo, Hi);

                                            Value *Den, unsigned AtLeast,
                                            bool IsSigned) const {
  if (RHSSignBits < AtLeast)

  if (LHSSignBits < AtLeast)

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = SSBits - SignBits + 1;

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  unsigned DivBits = SSBits - SignBits;

                                               Value *Den, bool IsDiv,
                                               bool IsSigned) const {
  unsigned AtLeast = (SSBits <= 24) ? 0 : (SSBits - 24 + IsSigned);
  int DivBits = getDivNumBits(I, Num, Den, AtLeast, IsSigned);
  if (DivBits == -1 || DivBits > 24)

  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
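// An illustrative standalone sketch (not part of this file) of the narrowing
// decision above, with constant operands standing in for the sign-bit and
// known-bits queries: the division only needs the operand width minus the
// redundant leading bits shared by both operands, plus one extra bit for the
// sign in the signed case (the "+ 1" visible above).
#include <algorithm>
#include <bit>
#include <cstdint>

inline unsigned redundantLeadingBits(int32_t V, bool IsSigned) {
  uint32_t U = uint32_t(V);
  if (!IsSigned)
    return unsigned(std::countl_zero(U));                            // leading zeros
  return unsigned(V < 0 ? std::countl_one(U) : std::countl_zero(U)); // sign copies
}

inline unsigned divNumBitsConst(int32_t Num, int32_t Den, bool IsSigned) {
  unsigned SignBits = std::min(redundantLeadingBits(Num, IsSigned),
                               redundantLeadingBits(Den, IsSigned));
  return 32u - SignBits + (IsSigned ? 1u : 0u);
}
// divNumBitsConst(100000, 7, false) == 17, so the 24-bit expansion applies.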
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    unsigned DivBits, bool IsDiv, bool IsSigned) const {

  auto FMAD = !ST.hasMadMacF32Insts()

                                   {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  if (DivBits != 0 && DivBits < 32) {

    int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);

          = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
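// An illustrative standalone sketch (not part of this file) of the principle
// behind the 24-bit expansion above: values that fit in 24 bits convert to
// float exactly, so the quotient can be estimated with a float reciprocal
// multiply and then fixed up from the exact remainder. The pass performs the
// fixup with a single fused multiply-add and compare; the loops below only
// exist to keep this reference version obviously correct.
#include <cmath>
#include <cstdint>

inline uint32_t udiv24Reference(uint32_t A, uint32_t B) { // A, B < 2^24, B != 0
  float FA = float(A), FB = float(B);                     // exact conversions
  uint32_t Q = uint32_t(std::truncf(FA * (1.0f / FB)));   // quotient estimate
  while (uint64_t(Q) * B > A)                             // estimate too high
    --Q;
  while (uint64_t(Q + 1) * B <= A)                        // estimate too low
    ++Q;
  return Q;
}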
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
  if (Constant *C = dyn_cast<Constant>(Den)) {

    if (C->getType()->getScalarSizeInBits() <= 32)

    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&

  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);
  if (divHasSpecialOptimization(I, X, Y))

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {

  Value *Sign = nullptr;

    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
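// A small standalone check (not part of this file) of the constant above:
// 0x4F7FFFFE reinterpreted as a float is 4294966784.0, i.e. 2^32 - 512, just
// below 2^32. Reading it as the scale that turns the floating-point reciprocal
// of the divisor into a roughly 2^32-scaled fixed-point reciprocal estimate is
// an interpretation, not something stated in this listing.
#include <bit>
#include <cstdio>

int main() {
  float Scale = std::bit_cast<float>(0x4F7FFFFEu);
  std::printf("%.1f\n", Scale);          // prints 4294966784.0
  return 0;
}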
  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits == -1)

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);

void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))

  if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA.isUniform(&I) && promoteUniformOpToI32(I))

  if (UseMul24Intrin && replaceMulWithMul24(I))

  bool Changed = false;

  Type *Ty = I.getType();
  Value *NewDiv = nullptr;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {

        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);

          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);

        NewDiv = shrinkDivRem64(Builder, I, Num, Den);

      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();

  if (ExpandDiv64InIR) {

      expandDivRem64(*Div);
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
      canWidenScalarExtLoad(I)) {

        mdconst::extract<ConstantInt>(Range->getOperand(0));

      if (Lower->isNullValue()) {
        WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);

    int TySize = DL.getTypeSizeInBits(I.getType());

    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();
bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
    Changed |= promoteUniformOpToI32(I);

bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
    if (UA.isUniform(&I))
      return promoteUniformOpToI32(I);
  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {
    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {
    Fract = applyFractPat(Builder, CmpVal);

  I.replaceAllUsesWith(Fract);
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();

  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());

  const Value *CurVal = V;

  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())

    const auto *VecSrc = IE->getOperand(0);

    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))

    EltsCovered.set(Idx->getZExtValue());

    if (EltsCovered.all())

  if (isa<Constant>(CurVal))

  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||
  const auto [It, Inserted] = SeenPHIs.insert(&I);

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))

bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())

  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);

  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {

    if (++NumBreakablePHIs >= Threshold) {

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;
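// A tiny standalone check (not part of this file) of the threshold arithmetic
// above: alignTo(N * 2, 3) / 3 is ceil(2N / 3), so breaking is only treated as
// profitable when at least two thirds of the PHIs in the connected group look
// breakable.
constexpr unsigned alignTo3(unsigned V) { return (V + 2u) / 3u * 3u; }
constexpr unsigned breakThreshold(unsigned NumPHIs) {
  return alignTo3(NumPHIs * 2u) / 3u;
}
static_assert(breakThreshold(1) == 1 && breakThreshold(3) == 2 &&
              breakThreshold(10) == 7, "ceil(2N/3)");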
  Value *&Res = SlicedVals[{BB, Inc}];

  if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
    B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    Res = B.CreateShuffleVector(Inc, Mask, NewValName);

    Res = B.CreateExtractElement(Inc, Idx, NewValName);
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
      DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))

  std::vector<VectorSlice> Slices;

  const unsigned EltSize = DL.getTypeSizeInBits(EltTy);

  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = (32 / EltSize);

      Slices.emplace_back(SubVecTy, Idx, SubVecSize);

  for (; Idx < NumElts; ++Idx)
    Slices.emplace_back(EltTy, Idx, 1);

  assert(Slices.size() > 1);

  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;

    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),

  unsigned NameSuffix = 0;

    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
          B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);

      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
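// An illustrative standalone sketch (not part of this file) of the slice
// layout chosen above: vectors of 8- or 16-bit elements are split into 32-bit
// subvector slices (4 x i8 or 2 x i16), and any leftover or wider elements get
// one slice each. SliceDesc and sliceLayout are hypothetical names.
#include <vector>

struct SliceDesc {
  unsigned Idx;      // first element covered by this slice
  unsigned NumElts;  // elements in this slice (1 == scalar slice)
};

inline std::vector<SliceDesc> sliceLayout(unsigned NumElts, unsigned EltSizeBits) {
  std::vector<SliceDesc> Slices;
  unsigned Idx = 0;
  if (EltSizeBits == 8 || EltSizeBits == 16) {
    const unsigned SubVecSize = 32 / EltSizeBits;      // elements per 32-bit slice
    for (; Idx + SubVecSize <= NumElts; Idx += SubVecSize)
      Slices.push_back({Idx, SubVecSize});
  }
  for (; Idx < NumElts; ++Idx)                         // leftover scalar slices
    Slices.push_back({Idx, 1});
  return Slices;
}
// sliceLayout(10, 8) yields {0,4}, {4,4}, {8,1}, {9,1}.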
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))

  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())

  if (AS != cast<PointerType>(V->getType())->getAddressSpace())

  const auto NullVal = TM.getNullPointerValue(AS);

  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();

  if (I.getType()->isVectorTy())

  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;

  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
    Changed |= promoteUniformBitreverseToI32(I);
  if (ST.hasFractBug())

  if (I.getIntrinsicID() != Intrinsic::minnum)

  Type *Ty = I.getType();

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

                  m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))

  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {

  Value *FractArg = matchFractPat(I);

  if (!I.hasNoNaNs() &&

  Value *Fract = applyFractPat(Builder, FractArg);

  I.replaceAllUsesWith(Fract);
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
  if (ReqdAccuracy < 1.0f)

  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&

  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)

  for (int I = 0, E = SrcVals.size(); I != E; ++I) {
      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);

      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();

      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();

  AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);

  if (!Impl.FlowChanged)

                "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

  return new AMDGPUCodeGenPrepare();
// File-local helpers used above (declarations):
static bool promotedOpIsNSW(const Instruction &I);
static Value *insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl<Value *> &Values);
static bool promotedOpIsNUW(const Instruction &I);
static bool isOneOrNegOne(const Value *Val);
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl<Value *> &Values, Value *V);
static Value *getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS);
static bool isInterestingPHIIncomingValue(const Value *V);
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast);
static std::pair<Value *, Value *> getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS);
// Emit an expansion of 1.0 / sqrt(Src) good for 1 ulp that supports denormals.
static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative);
static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL);
static void collectPHINodes(const PHINode &I, SmallPtrSet<const PHINode *, 8> &SeenPHIs);
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS);
static bool areInSameBB(const Value *A, const Value *B);
// Helper class for "break large PHIs" (visitPHINode):
//   VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
//   Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
//     Slice Inc according to the information contained within this slice.