#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-codegenprepare"
43 "amdgpu-codegenprepare-widen-constant-loads",
44 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
49 "amdgpu-codegenprepare-widen-16-bit-ops",
50 cl::desc(
"Widen uniform 16-bit instructions to 32-bit in AMDGPUCodeGenPrepare"),
55 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
56 cl::desc(
"Break large PHI nodes for DAGISel"),
60 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
61 cl::desc(
"For testing purposes, always break large "
62 "PHIs even if it isn't profitable."),
66 "amdgpu-codegenprepare-break-large-phis-threshold",
67 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
71 "amdgpu-codegenprepare-mul24",
72 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
78 "amdgpu-codegenprepare-expand-div64",
79 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
86 "amdgpu-codegenprepare-disable-idiv-expansion",
87 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
93 "amdgpu-codegenprepare-disable-fdiv-expansion",
94 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
static bool hasUnsafeFPMath(const Function &F) {
  return F.getFnAttribute("unsafe-fp-math").getValueAsBool();
}

class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
  const bool HasUnsafeFPMath;
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;

      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
        DT(DT), UA(UA), DL(F.getDataLayout()),
        HasUnsafeFPMath(hasUnsafeFPMath(F)),

        F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});

        F.getParent(), Intrinsic::ldexp,
        {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
  bool canBreakPHINode(const PHINode &I);

  unsigned getBaseElementBitWidth(const Type *T) const;

  bool needsPromotionToI32(const Type *T) const;

  bool isLegalFloatingTy(const Type *T) const;

  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||

  bool promoteUniformOpToI32(ICmpInst &I) const;

  unsigned numBitsUnsigned(Value *Op) const;
  unsigned numBitsSigned(Value *Op) const;

                           unsigned MaxDivBits, bool Signed) const;

                          bool IsDiv, bool IsSigned) const;

                              bool IsDiv, bool IsSigned) const;

  bool canWidenScalarExtLoad(LoadInst &I) const;

                               float ReqdAccuracy) const;

                            float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,

                          bool IsNegative) const;

    if (!ExpandDiv64InIR)
bool AMDGPUCodeGenPrepareImpl::run() {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

    NextBB = std::next(FI);

      if (NextInstBB != BB) {

unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return T->getIntegerBitWidth();
  return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}
  assert(needsPromotionToI32(T) && "T does not need promotion to i32");

  if (T->isIntegerTy())
    return B.getInt32Ty();

bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
  return I.getOpcode() == Instruction::AShr ||
         I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem;
}

bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
  return isa<ICmpInst>(I.getOperand(0))
             ? cast<ICmpInst>(I.getOperand(0))->isSigned()
             : false;
}
bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {

  if (const VectorType *VT = dyn_cast<VectorType>(T)) {

    if (ST.hasVOP3PInsts())

    return needsPromotionToI32(VT->getElementType());

bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {

  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::Mul:
    return I.hasNoUnsignedWrap();

  switch (I.getOpcode()) {
  case Instruction::Shl:
  case Instruction::Add:
  case Instruction::Mul:
  case Instruction::Sub:
    return I.hasNoUnsignedWrap();
bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
  Type *Ty = I.getType();
  int TySize = DL.getTypeSizeInBits(Ty);
  Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);

  return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
}
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  if (I.getOpcode() == Instruction::SDiv ||
      I.getOpcode() == Instruction::UDiv ||
      I.getOpcode() == Instruction::SRem ||
      I.getOpcode() == Instruction::URem)

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
      Inst->setHasNoSignedWrap();
      Inst->setHasNoUnsignedWrap();

    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
      Inst->setIsExact(ExactOp->isExact());

  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
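
// [Illustrative sketch, not part of the original file] Scalar model of the
// i16 -> i32 promotion above: the operation is done in 32 bits (sign- or
// zero-extended depending on isSigned(I)) and the result truncated back,
// which is exact because truncation only discards bits a 16-bit op never had.
#include <cstdint>
static uint16_t promotedAdd16(uint16_t A, uint16_t B) {
  uint32_t Wide = uint32_t(A) + uint32_t(B); // zext operands, 32-bit add
  return uint16_t(Wide);                     // trunc back to 16 bits
}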
bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
  assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getOperand(0)->getType());
  Value *ExtOp0 = nullptr;
  Value *ExtOp1 = nullptr;
  Value *NewICmp = nullptr;

    ExtOp0 = Builder.CreateSExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);

    ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);

  NewICmp = Builder.CreateICmp(I.getPredicate(), ExtOp0, ExtOp1);

  I.replaceAllUsesWith(NewICmp);

bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp1 = nullptr;
  Value *ExtOp2 = nullptr;
  Value *ExtRes = nullptr;
  Value *TruncRes = nullptr;

    ExtOp1 = Builder.CreateSExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateSExt(I.getOperand(2), I32Ty);

    ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
    ExtOp2 = Builder.CreateZExt(I.getOperand(2), I32Ty);

  ExtRes = Builder.CreateSelect(I.getOperand(0), ExtOp1, ExtOp2);
  TruncRes = Builder.CreateTrunc(ExtRes, I.getType());

  I.replaceAllUsesWith(TruncRes);
bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
  assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
         "I must be bitreverse intrinsic");
  assert(needsPromotionToI32(I.getType()) &&
         "I does not need promotion to i32");

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  Type *I32Ty = getI32Ty(Builder, I.getType());
  Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
  Value *ExtRes =
      Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
  Value *LShrOp =
      Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
  Value *TruncRes =
      Builder.CreateTrunc(LShrOp, I.getType());

  I.replaceAllUsesWith(TruncRes);
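
// [Illustrative sketch, not part of the original file] Model of the i16
// bitreverse promotion above: reverse the zero-extended value in 32 bits,
// then shift right by (32 - 16) so the reversed bits land back in the low
// half. Assumes Clang's __builtin_bitreverse32.
#include <cstdint>
static uint16_t brev16Via32(uint16_t X) {
  return uint16_t(__builtin_bitreverse32(uint32_t(X)) >> (32 - 16));
}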
unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {

unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {

  auto *VT = dyn_cast<FixedVectorType>(V->getType());

  for (int I = 0, E = VT->getNumElements(); I != E; ++I)

  for (int I = 0, E = Values.size(); I != E; ++I)
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)

  Type *Ty = I.getType();

  if (Size <= 16 && ST.has16BitInsts())

  if (UA.isUniform(&I))

  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS)) <= 24) {
  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
             (RHSBits = numBitsSigned(RHS)) <= 24) {

  Type *DstTy = LHSVals[0]->getType();

  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);

  I.replaceAllUsesWith(NewVal);
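
// [Illustrative sketch, not part of the original file] Why the rewrite above
// is safe: when both operands have at most 24 significant bits, the low
// 32 bits of the product equal the low 32 bits of a full-width multiply,
// which is exactly what a 24-bit multiply unit (amdgcn.mul.u24/mul.i24)
// returns.
#include <cstdint>
static uint32_t mulU24Model(uint32_t A, uint32_t B) {
  // The masks are no-ops for operands already known to fit in 24 bits.
  return (A & 0xFFFFFFu) * (B & 0xFFFFFFu);
}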
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))

  if ((Cast = dyn_cast<CastInst>(V))) {

bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {

  if (!CBO || !CT || !CF)

  if (!FoldedT || isa<ConstantExpr>(FoldedT))

  if (!FoldedF || isa<ConstantExpr>(FoldedF))

  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());
std::pair<Value *, Value *>
AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
  Type *Ty = Src->getType();

  return {FrexpMant, FrexpExp};

                                               bool IsNegative) const {

  auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);

  return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
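
// [Illustrative sketch, not part of the original file] The scaling idea
// behind emitRcpIEEE1ULP: split Src into mantissa and exponent, take the
// reciprocal of the mantissa (always in a safe range), and re-apply the
// negated exponent with ldexp. The C library frexpf/ldexpf stand in for the
// llvm.frexp / llvm.ldexp intrinsics used above.
#include <cmath>
static float rcpViaFrexp(float Src) {
  int Exp;
  float Mant = std::frexp(Src, &Exp); // Src == Mant * 2^Exp, |Mant| in [0.5, 1)
  return std::ldexp(1.0f / Mant, -Exp);
}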
  if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&

  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
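
// [Illustrative sketch, not part of the original file] The same scaling trick
// applied to a full division (emitFrexpDiv): divide the two mantissas, then
// fold the exponent difference back in with ldexp, so neither the reciprocal
// nor the multiply ever sees a denormal-range intermediate.
#include <cmath>
static float fdivViaFrexp(float LHS, float RHS) {
  int ExpLHS, ExpRHS;
  float MantLHS = std::frexp(LHS, &ExpLHS);
  float MantRHS = std::frexp(RHS, &ExpRHS);
  return std::ldexp(MantLHS * (1.0f / MantRHS), ExpLHS - ExpRHS);
}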
  Type *Ty = Src->getType();

      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  Value *InputScaleFactor =

  Value *OutputScaleFactor =

  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
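
// [Illustrative sketch, not part of the original file] The denormal handling
// in emitSqrtIEEE2ULP: inputs below the smallest normal are scaled up by 2^32
// before the hardware sqrt and the result scaled back down by 2^-16, since
// sqrt(x * 2^32) == sqrt(x) * 2^16.
#include <cmath>
#include <cfloat>
static float sqrtWithDenormScaling(float Src) {
  bool NeedScale = Src < FLT_MIN;                 // below the smallest normal
  float Scaled = NeedScale ? std::ldexp(Src, 32) : Src;
  float Sqrt = std::sqrt(Scaled);                 // stand-in for amdgcn.sqrt
  return NeedScale ? std::ldexp(Sqrt, -16) : Sqrt;
}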
  Type *Ty = Src->getType();

      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
  Constant *One = ConstantFP::get(Ty, 1.0);
  Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
      ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);

      NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);

  return Builder.CreateFMul(Rsq, OutputScaleFactor);
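
// [Illustrative sketch, not part of the original file] The corresponding
// scaling for 1/sqrt (emitRsqIEEE1ULP): denormal-range inputs are multiplied
// by 2^24 and the result by 2^12, because 1/sqrt(x * 2^24) == (1/sqrt(x)) * 2^-12.
// The IsNegative path above simply folds a sign flip into the output scale.
#include <cmath>
#include <cfloat>
static float rsqWithDenormScaling(float Src) {
  bool NeedScale = Src < FLT_MIN;
  float Scaled = NeedScale ? Src * 0x1.0p+24f : Src;
  float Rsq = 1.0f / std::sqrt(Scaled);           // stand-in for amdgcn.rsq
  return Rsq * (NeedScale ? 0x1.0p+12f : 1.0f);
}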
bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,

  return SqrtFMF.approxFunc() || HasUnsafeFPMath ||

Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(

  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);

  bool IsNegative = false;

      canIgnoreDenormalInput(Den, CtxI)) {

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;

    if (HasFP32DenormalFlush || FMF.approxFunc()) {

      return emitRcpIEEE1ULP(Builder, Src, IsNegative);

  if (HasFP32DenormalFlush || FMF.approxFunc()) {

    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
  if (ReqdAccuracy < 2.5f)

  bool NumIsOne = false;
  if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
    if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))

  if (!HasFP32DenormalFlush && !NumIsOne)

  return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});

Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
    float ReqdDivAccuracy) const {

      optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);

  Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);

  Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);

  return emitFrexpDiv(Builder, Num, Den, DivFMF);
  if (DisableFDivExpand)

  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);

    if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))

  const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)

  if (ReqdAccuracy < 1.0f)

  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);

    if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
      NewEltInst->copyMetadata(FDiv);

    ResultVals[I] = NewElt;
  return std::pair(Lo, Hi);
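
// [Illustrative sketch, not part of the original file] getMul64 in scalar
// form: a 32x32 multiply split into low and high halves (mul + mulhu), the
// building block used by the unsigned 32-bit division expansion below.
#include <cstdint>
#include <utility>
static std::pair<uint32_t, uint32_t> mul64Model(uint32_t LHS, uint32_t RHS) {
  uint64_t Full = uint64_t(LHS) * uint64_t(RHS);
  return {uint32_t(Full), uint32_t(Full >> 32)}; // {Lo, Hi}
}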
                                              unsigned MaxDivBits,
                                              bool IsSigned) const {

    unsigned DivBits = SSBits - RHSSignBits + 1;
    if (DivBits > MaxDivBits)

    unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    DivBits = SSBits - SignBits + 1;

  unsigned DivBits = SSBits - RHSSignBits;
  if (DivBits > MaxDivBits)

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  DivBits = SSBits - SignBits;
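
// [Illustrative sketch, not part of the original file] The narrowing logic
// above in plain terms: if the top SignBits bits of both operands are copies
// of the sign bit (signed) or known zero (unsigned), the quotient fits in the
// remaining bits, plus one extra bit for the sign in the signed case.
static unsigned divNumBitsModel(unsigned ScalarBits, unsigned LHSSignBits,
                                unsigned RHSSignBits, bool IsSigned) {
  unsigned SignBits = LHSSignBits < RHSSignBits ? LHSSignBits : RHSSignBits;
  return ScalarBits - SignBits + (IsSigned ? 1 : 0);
}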
                                             Value *Den, bool IsDiv,
                                             bool IsSigned) const {
  unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);

  return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    unsigned DivBits, bool IsDiv, bool IsSigned) const {

  auto FMAD = !ST.hasMadMacF32Insts()

                                 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  if (DivBits != 0 && DivBits < 32) {

    int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);

          = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
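
// [Illustrative sketch, not part of the original file] What the DivBits < 32
// clean-up above does: the float-based expansion only produces DivBits valid
// result bits, so the value is either sign-extended in register (shl + ashr)
// or masked, depending on signedness.
#include <cstdint>
static int32_t truncToDivBits(uint32_t Res, unsigned DivBits, bool IsSigned) {
  unsigned InRegBits = 32 - DivBits;
  if (IsSigned)
    return int32_t(Res << InRegBits) >> InRegBits;     // shl + ashr: sign-extend
  return int32_t(Res & ((UINT64_C(1) << DivBits) - 1)); // mask: zero-extend
}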
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,

  if (Constant *C = dyn_cast<Constant>(Den)) {

    if (C->getType()->getScalarSizeInBits() <= 32)

    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  if (divHasSpecialOptimization(I, X, Y))

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();

  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {

  Value *Sign = nullptr;

    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
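
// [Illustrative sketch, not part of the original file] A hedged reading of
// the sign handling above: the 32-bit expansion works on magnitudes, and the
// final sign is re-applied afterwards. A quotient takes sign(X) ^ sign(Y); a
// remainder follows the dividend. With S = X >> 31 (all ones or all zeros),
// the usual (V ^ S) - S pattern flips the sign when S is -1.
#include <cstdint>
static int32_t applySign(int32_t Res, int32_t SignX, int32_t SignY,
                         bool IsDiv) {
  int32_t Sign = IsDiv ? (SignX ^ SignY) : SignX; // 0 or -1
  return (Res ^ Sign) - Sign;
}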
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));

  if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))

  bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
  bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;

  unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
  if (NumDivBits > 32)

  Value *Narrowed = nullptr;
  if (NumDivBits <= 24) {
    Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
  } else if (NumDivBits <= 32) {
    Narrowed = expandDivRem32(Builder, I, Num, Den);

void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {

  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))

  if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
      UA.isUniform(&I) && promoteUniformOpToI32(I))

  if (UseMul24Intrin && replaceMulWithMul24(I))

  bool Changed = false;

  Type *Ty = I.getType();
  Value *NewDiv = nullptr;

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {

        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);

          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);

            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);

        NewDiv = shrinkDivRem64(Builder, I, Num, Den);

      I.replaceAllUsesWith(NewDiv);
      I.eraseFromParent();

  if (ExpandDiv64InIR) {

      expandDivRem64(*Div);
bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {

      canWidenScalarExtLoad(I)) {

        mdconst::extract<ConstantInt>(Range->getOperand(0));

    if (Lower->isNullValue()) {
      WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);

    int TySize = DL.getTypeSizeInBits(I.getType());

    I.replaceAllUsesWith(ValOrig);
    I.eraseFromParent();

bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
  bool Changed = false;

  if (ST.has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
    Changed |= promoteUniformOpToI32(I);
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {

  if (ST.has16BitInsts() && needsPromotionToI32(I.getType())) {
    if (UA.isUniform(&I))
      return promoteUniformOpToI32(I);

  auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
  auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);

  Value *Fract = nullptr;
  if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
      CmpVal == matchFractPat(*IIFalse)) {
    Fract = applyFractPat(Builder, CmpVal);
  } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
             CmpVal == matchFractPat(*IITrue)) {
    Fract = applyFractPat(Builder, CmpVal);

  I.replaceAllUsesWith(Fract);
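
// [Illustrative sketch, not part of the original file] The pattern being
// matched above: fract(x) written as minnum(x - floor(x), 0x1.fffffep-1),
// wrapped in a select that passes NaN inputs through unchanged. That is the
// behaviour of the hardware fract instruction, so the whole select can
// collapse to llvm.amdgcn.fract.
#include <cmath>
static float fractPatternModel(float X) {
  if (std::isnan(X))                       // the FCMP_UNO / FCMP_ORD select arm
    return X;
  return std::fmin(X - std::floor(X), 0x1.fffffep-1f);
}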
  const auto *IA = dyn_cast<Instruction>(A);
  const auto *IB = dyn_cast<Instruction>(B);
  return IA && IB && IA->getParent() == IB->getParent();

  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());

  const Value *CurVal = V;

  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())

    const auto *VecSrc = IE->getOperand(0);

    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))

    EltsCovered.set(Idx->getZExtValue());

  if (EltsCovered.all())

  if (isa<Constant>(CurVal))

  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||

  const auto [It, Inserted] = SeenPHIs.insert(&I);

  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))

  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {

  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())

  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);

  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {

    if (++NumBreakablePHIs >= Threshold) {

  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  Value *&Res = SlicedVals[{BB, Inc}];

  if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
    B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    Res = B.CreateShuffleVector(Inc, Mask, NewValName);

    Res = B.CreateExtractElement(Inc, Idx, NewValName);
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {

      DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)

  if (!ForceBreakLargePHIs && !canBreakPHINode(I))

  std::vector<VectorSlice> Slices;

  const unsigned EltSize = DL.getTypeSizeInBits(EltTy);

  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = (32 / EltSize);

      Slices.emplace_back(SubVecTy, Idx, SubVecSize);

  for (; Idx < NumElts; ++Idx)
    Slices.emplace_back(EltTy, Idx, 1);

  assert(Slices.size() > 1);

  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;

    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),

  unsigned NameSuffix = 0;

    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
      Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);

  I.replaceAllUsesWith(Vec);
  I.eraseFromParent();
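
// [Illustrative sketch, not part of the original file] The slicing policy in
// visitPHINode above, in isolation: 8- and 16-bit elements are grouped into
// 32-bit sub-vector slices (4 x i8 or 2 x i16) for as long as a full group
// fits, and any leftover elements become single-element slices.
#include <cstdio>
static void printPhiSlices(unsigned NumElts, unsigned EltSize) {
  unsigned Idx = 0;
  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = 32 / EltSize;
    for (; Idx + SubVecSize <= NumElts; Idx += SubVecSize)
      std::printf("slice at %u: %u x i%u\n", Idx, SubVecSize, EltSize);
  }
  for (; Idx < NumElts; ++Idx)
    std::printf("slice at %u: 1 x i%u\n", Idx, EltSize);
}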
  if (isa<BlockAddress>(V) || isa<GlobalValue>(V) || isa<AllocaInst>(V))

  if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())

  if (AS != cast<PointerType>(V->getType())->getAddressSpace())

  const auto NullVal = TM.getNullPointerValue(AS);

  assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
  assert((NullVal == 0 || NullVal == -1) &&
         "don't know how to check for this null value!");
  return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();

  if (I.getType()->isVectorTy())

  const unsigned SrcAS = I.getSrcAddressSpace();
  const unsigned DstAS = I.getDestAddressSpace();

  bool CanLower = false;

  auto *Intrin = B.CreateIntrinsic(
      I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
  I.replaceAllUsesWith(Intrin);
  I.eraseFromParent();
bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
  switch (I.getIntrinsicID()) {
  case Intrinsic::bitreverse:
    return visitBitreverseIntrinsicInst(I);
  case Intrinsic::minnum:
    return visitMinNum(I);
  case Intrinsic::sqrt:
    return visitSqrt(I);

bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
  bool Changed = false;

  if (ST.has16BitInsts() && needsPromotionToI32(I.getType()) &&
    Changed |= promoteUniformBitreverseToI32(I);
  if (ST.hasFractBug())

  if (I.getIntrinsicID() != Intrinsic::minnum)

  Type *Ty = I.getType();

  Value *Arg0 = I.getArgOperand(0);
  Value *Arg1 = I.getArgOperand(1);

  One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);

          m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))

  for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {

  Value *FractArg = matchFractPat(I);

  if (!I.hasNoNaNs() &&

  Value *Fract = applyFractPat(Builder, FractArg);

  I.replaceAllUsesWith(Fract);
bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {

  if (ReqdAccuracy < 1.0f)

  if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
      FDiv->getFPAccuracy() >= 1.0f &&

  bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);

  if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)

  for (int I = 0, E = SrcVals.size(); I != E; ++I) {

      ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);

      ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
  if (skipFunction(F))

  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();

      &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);

      &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
  auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
  const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;

      getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
  return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();

  AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);

  if (!Impl.FlowChanged)

                    "AMDGPU IR optimizations", false, false)

char AMDGPUCodeGenPrepare::ID = 0;

  return new AMDGPUCodeGenPrepare();
static bool promotedOpIsNSW(const Instruction &I)
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static bool promotedOpIsNUW(const Instruction &I)
static bool isOneOrNegOne(const Value *Val)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
Helper class for "break large PHIs" (visitPHINode).
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)