#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "amdgpu-simplifylib"
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
                                   cl::desc("Enable pre-link mode optimizations"),
                                   cl::init(false), cl::Hidden);

static cl::list<std::string> UseNative("amdgpu-use-native",
  cl::desc("Comma separated list of functions to replace with native, or all"),
  cl::CommaSeparated, cl::ValueOptional, cl::Hidden);
#define MATH_PI      numbers::pi
#define MATH_E       numbers::e
#define MATH_SQRT2   numbers::sqrt2
#define MATH_SQRT1_2 numbers::inv_sqrt2
bool AllNative = false;
bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
                            Constant *copr0, Constant *copr1, Constant *copr2);
177 "Simplify well-known AMD library calls",
false,
false)
183 "Replace builtin math
calls with
that native versions.",
template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg, const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, Arg, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}
template <typename IRB>
static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1, Value *Arg2, const Twine &Name = "") {
  CallInst *R = B.CreateCall(Callee, {Arg1, Arg2}, Name);
  if (Function *F = dyn_cast<Function>(Callee.getCallee()))
    R->setCallingConv(F->getCallingConv());
  return R;
}
bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
                                       FuncInfo &FInfo) {
  return AMDGPULibFunc::parse(FMangledName, FInfo);
}
bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
  if (auto Op = dyn_cast<FPMathOperator>(CI))
    if (Op->isFast())
      return true;
  const Function *F = CI->getParent()->getParent();
  Attribute Attr = F->getFnAttribute("unsafe-fp-math");
  return Attr.getValueAsBool();
}
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
  return AllNative || llvm::is_contained(UseNative, F);
}

void AMDGPULibCalls::initNativeFuncs() {
  AllNative = useNativeFunc("all") ||
              (UseNative.getNumOccurrences() && UseNative.size() == 1 &&
               UseNative.begin()->empty());
}
bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
  bool native_sin = useNativeFunc("sin");
  bool native_cos = useNativeFunc("cos");
  if (native_sin && native_cos) {
    if (sinExpr && cosExpr) {
      DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                          << " with native version of sin/cos");
bool AMDGPULibCalls::useNative(CallInst *aCI) {
  if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
      !(AllNative || useNativeFunc(FInfo.getName()))) {
    return false;
  }

  if (FInfo.getId() == AMDGPULibFunc::EI_SINCOS)
    return sincosUseNative(aCI, FInfo);

  DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
                                      << " with native version");
bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
                                          const FuncInfo &FInfo) {
  auto *Callee = CI->getCalledFunction();
  if (!Callee->isDeclaration())
    return false;

  assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
  auto *M = Callee->getParent();
  auto &Ctx = M->getContext();
  std::string Name = std::string(Callee->getName());
  auto NumArg = CI->arg_size();
  if (NumArg != 4 && NumArg != 6)
    return false;

  Value *PacketSize = CI->getArgOperand(NumArg - 2);
  Value *PacketAlign = CI->getArgOperand(NumArg - 1);
  if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
    return false;

  unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
  Align Alignment = cast<ConstantInt>(PacketAlign)->getAlignValue();
  if (Alignment != Size)
    return false;

  SmallVector<llvm::Type *, 6> ArgTys;
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    ArgTys.push_back(CI->getArgOperand(I)->getType());
  ArgTys.push_back(PtrTy);

  SmallVector<Value *, 6> Args;
  auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
  for (unsigned I = 0; I != PtrArgLoc; ++I)
    Args.push_back(CI->getArgOperand(I));
  Args.push_back(BCast);

  auto *NCI = B.CreateCall(F, Args);
  B.setFastMathFlags(FPOp->getFastMathFlags());

  switch (Callee->getIntrinsicID()) {
  case Intrinsic::amdgcn_wavefrontsize:
    return !EnablePreLink && fold_wavefrontsize(CI, B);
  default:
    break;
  }

  if (!parseFunctionName(Callee->getName(), FInfo))
    return false;

  if (TDOFold(CI, FInfo))
    return true;

  switch (FInfo.getId()) {
  case AMDGPULibFunc::EI_RECIP:
    assert((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
            FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
           "recip must be either a native or half function");
  case AMDGPULibFunc::EI_DIVIDE:
    assert((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
            FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
           "divide must be either a native or half function");
  case AMDGPULibFunc::EI_POW:
  case AMDGPULibFunc::EI_POWN:
  case AMDGPULibFunc::EI_POWR:
    return fold_pow(CI, B, FInfo);
  case AMDGPULibFunc::EI_SIN:
  case AMDGPULibFunc::EI_COS:
    return fold_sincos(CI, B, AA);
  case AMDGPULibFunc::EI_READ_PIPE_2:
  case AMDGPULibFunc::EI_READ_PIPE_4:
  case AMDGPULibFunc::EI_WRITE_PIPE_2:
  case AMDGPULibFunc::EI_WRITE_PIPE_4:
    return fold_read_write_pipe(CI, B, FInfo);
  }
bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
    for (int eltNo = 0; eltNo < getVecSize(FInfo); ++eltNo) {
      ConstantFP *eltval = dyn_cast<ConstantFP>(
          CV->getElementAsConstant((unsigned)eltNo));
      assert(eltval && "Non-FP arguments in math function!");
      for (int i = 0; i < sz; ++i) {

    for (unsigned i = 0; i < DVal.size(); ++i) {
      FVal.push_back((float)DVal[i]);

  if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
    for (int i = 0; i < sz; ++i) {
      if (CF->isExactlyValue(tr[i].input)) {
bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
                                 const FuncInfo &FInfo) {
  Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0),
                              opr1, "__div2recip");
  Value *nval  = B.CreateFMul(opr0, nval1, "__div2mul");
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
                              const FuncInfo &FInfo) {
  assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
          FInfo.getId() == AMDGPULibFunc::EI_POWN ||
          FInfo.getId() == AMDGPULibFunc::EI_POWR) &&
         "fold_pow: encounter a wrong function call");

  CZero = dyn_cast<ConstantAggregateZero>(opr1);

  CF = dyn_cast<ConstantFP>(opr1);
  CINT = dyn_cast<ConstantInt>(opr1);

  assert(VTy && "Operand of vector function should be of vectortype");
  CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr;
  CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr;

  // pow/pown/powr(x, 0) == 1
  if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
    // pow(x, 2.0) == x * x
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0
                      << "\n");
    Value *nval = B.CreateFMul(opr0, opr0, "__pow2");

  // pow(x, -1.0) == 1.0 / x
  if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
    Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");

    // pow(x, +/-0.5) == sqrt(x) / rsqrt(x)
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
                      << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
    Value *nval = CreateCallEx(B, FPExpr, opr0, issqrt ? "__pow2sqrt"
                                                       : "__pow2rsqrt");

  // Is the exponent a small integer?
  int ival = (int)dval;
  if ((double)ival == dval) {
    ci_opr1 = ival;
  } else
    ci_opr1 = 0x11111111;

  // pow/pown(x, n) with |n| <= 12: expand into a product of squarings.
  unsigned abs_opr1 = (ci_opr1 < 0) ? -ci_opr1 : ci_opr1;
  if (abs_opr1 <= 12) {
    Value *valx2 = nullptr;
    Value *nval = nullptr;
    while (abs_opr1 > 0) {
      valx2 = valx2 ? B.CreateFMul(valx2, valx2, "__powx2") : opr0;
      if (abs_opr1 & 1)
        nval = nval ? B.CreateFMul(nval, valx2, "__powprod") : valx2;
      abs_opr1 >>= 1;
    }
    if (ci_opr1 < 0)
      nval = B.CreateFDiv(cnval, nval, "__1powprod");
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
                      << ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
                      << ")\n");
  bool needlog = false;
  bool needabs = false;
  bool needcopysign = false;

  CF = dyn_cast<ConstantFP>(opr0);

  assert((int)CDV->getNumElements() == getVecSize(FInfo) &&
         "Wrong vector size detected");
  if (V < 0.0) needcopysign = true;
  for (unsigned i = 0; i < DVal.size(); ++i) {
    FVal.push_back((float)DVal[i]);

  if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) {
    if (y != (double)(int64_t)y)
      return false;
  if (y != (double)(int64_t)y)
    return false;

  nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
  nval = cnval ? cnval : opr0;
  nval = CreateCallEx(B, LogExpr, nval, "__log2");

  if (needcopysign) {
    opr1 = B.CreateSIToFP(opr1, nval->getType(), "pownI2F");
  }
  nval = B.CreateFMul(opr1, nval, "__ylogx");
  nval = CreateCallEx(B, ExpExpr, nval, "__exp2");

  if (needcopysign) {
    if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))

    opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou");
    opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");

    Value *sign = B.CreateShl(opr_n, size - 1, "__yeven");
    sign = B.CreateAnd(B.CreateBitCast(opr0, nTy), sign, "__pow_sign");
    nval = B.CreateOr(B.CreateBitCast(nval, nTy), sign);
    nval = B.CreateBitCast(nval, opr0->getType());
  }

  LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
                    << "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
                                const FuncInfo &FInfo) {
  if (ci_opr1 == 2) { // rootn(x, 2) == sqrt(x)
    Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2sqrt");
  } else if (ci_opr1 == 3) { // rootn(x, 3) == cbrt(x)
    Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2cbrt");
  } else if (ci_opr1 == -1) { // rootn(x, -1) == 1.0 / x
  } else if (ci_opr1 == -2) { // rootn(x, -2) == rsqrt(x)
    Value *nval = CreateCallEx(B, FPExpr, opr0, "__rootn2rsqrt");
bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
                                  const FuncInfo &FInfo) {
  ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
  ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);

  // fma/mad(a, b, c) == c if a == 0 || b == 0
  if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {

  // fma/mad(a, b, c) == b + c if a == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
                      << "\n");
    Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");

  // fma/mad(a, b, c) == a + c if b == 1
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
                      << "\n");
    Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");

  // fma/mad(a, b, c) == a * b if c == 0
  if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
    Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
                                                 const FuncInfo &FInfo) {
  FuncInfo nf = FInfo;
  nf.setPrefix(AMDGPULibFunc::NATIVE);
  return getFunction(M, nf);
}
bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
                               const FuncInfo &FInfo) {
    LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
                      << "sqrt(" << *opr0 << ")\n");
    Value *nval = CreateCallEx(B, FPExpr, opr0, "__sqrt");
  int const MaxScan = 30;
  bool Changed = false;

  LoadInst *LI = dyn_cast<LoadInst>(CArgVal);

  std::string const PairName = fInfo.mangle();

  CallInst *XI = dyn_cast_or_null<CallInst>(U);

  for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
    if (cast<Instruction>(BBI) == XI) {

  AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
  B.SetInsertPoint(UI);

  P = B.CreateAddrSpaceCast(Alloc, PTy);

  LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "

  B.SetInsertPoint(&*ItOld);
  Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);

  Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
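// Sketch of the pairing scan above, in plain C++ with hypothetical types: starting
// from the current call, walk backwards through the containing block for at most
// MaxScan instructions looking for the matching sin/cos call (XI). Only if the
// partner is found nearby is the pair rewritten into a single sincos.
#include <cstdio>
#include <string>
#include <vector>
static int find_partner(const std::vector<std::string> &Block, int CallIdx,
                        const std::string &PairName, int MaxScan = 30) {
  for (int I = MaxScan, Idx = CallIdx - 1; I > 0 && Idx >= 0; --I, --Idx)
    if (Block[Idx] == PairName) // stands in for "cast<Instruction>(BBI) == XI"
      return Idx;
  return -1; // not found within the scan window; no fold
}
int main() {
  std::vector<std::string> BB = {"cos(x)", "mul", "add", "sin(x)"};
  std::printf("%d\n", find_partner(BB, 3, "cos(x)")); // prints 0
}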
  unsigned N = ST.getWavefrontSize();

  assert(BB && "Entry block not found!");
AllocaInst *AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
                                         const char *prefix) {
  B.SetInsertPoint(&*ItNew);
  AllocaInst *Alloc =
      B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
  Alloc->setAlignment(
bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
                                            double &Res0, double &Res1,
                                            Constant *copr0, Constant *copr1,
                                            Constant *copr2) {
  double opr0 = 0.0, opr1 = 0.0, opr2 = 0.0;
  ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
  ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
  ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2);
  switch (FInfo.getId()) {
  default:
    return false;

    Res0 = log(opr0 + sqrt(opr0 * opr0 - 1.0));                           // acosh
    Res0 = log(opr0 + sqrt(opr0 * opr0 + 1.0));                           // asinh
    Res0 = (log(opr0 + 1.0) - log(opr0 - 1.0)) / 2.0;
    Res0 = (opr0 < 0.0) ? -pow(-opr0, 1.0 / 3.0) : pow(opr0, 1.0 / 3.0);  // cbrt
    Res0 = pow(2.0, opr0);                                                // exp2
    Res0 = pow(10.0, opr0);                                               // exp10
    Res0 = exp(opr0) - 1.0;                                               // expm1
    Res0 = log(opr0) / log(2.0);                                          // log2
    Res0 = log(opr0) / log(10.0);                                         // log10
    Res0 = 1.0 / sqrt(opr0);                                              // rsqrt
    Res0 = pow(opr0, opr1);                                               // pow

    // pown
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, val);
    }

    // rootn
    if (ConstantInt *iopr1 = dyn_cast_or_null<ConstantInt>(copr1)) {
      double val = (double)iopr1->getSExtValue();
      Res0 = pow(opr0, 1.0 / val);
    }

    Res0 = opr0 * opr1 + opr2;                                            // fma / mad
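// The switch above constant-folds library math calls at compile time by evaluating
// the listed identities with the host's double-precision libm. A standalone
// illustration of a few of them (sketch; uses <cmath> directly rather than the
// pass's FuncInfo dispatch):
#include <cmath>
#include <cstdio>
int main() {
  double x = 2.0;
  double acosh_x = std::log(x + std::sqrt(x * x - 1.0)); // same identity as the acosh case
  double asinh_x = std::log(x + std::sqrt(x * x + 1.0)); // same identity as the asinh case
  double rsqrt_x = 1.0 / std::sqrt(x);                   // rsqrt
  std::printf("%g %g %g\n", acosh_x, asinh_x, rsqrt_x);
  std::printf("%g %g\n", std::acosh(x), std::asinh(x));  // cross-check against libm
}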
bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
  if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
    return false;

  if ((copr1 = dyn_cast<Constant>(aCI->getArgOperand(1))) == nullptr) {

  if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr)
    return false;
  double DVal0[16], DVal1[16];

  if (FuncVecSize == 1) {
    if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0],
                                copr0, copr1, copr2)) {
      return false;
    }
  } else {
    for (int i = 0; i < FuncVecSize; ++i) {
      if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i],
                                  celt0, celt1, celt2)) {
        return false;
      }
    }
  }

  if (FuncVecSize == 1) {

  for (int i = 0; i < FuncVecSize; ++i)
    FVal0.push_back((float)DVal0[i]);
  if (hasTwoResults) {
    for (int i = 0; i < FuncVecSize; ++i)
      FVal1.push_back((float)DVal1[i]);
  }

  if (hasTwoResults) {

  if (hasTwoResults) {
    assert(FInfo.getId() == AMDGPULibFunc::EI_SINCOS &&
           "math function with ptr arg not supported yet");
FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) {
  return new AMDGPUSimplifyLibCalls(TM);
}
FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
  return new AMDGPUUseNativeCalls();
}
  if (skipFunction(F))
    return false;

  bool Changed = false;
  auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  for (auto &BB : F) {
      if (Callee == nullptr)
        continue;

      if (Simplifier.fold(CI, AA))
        Changed = true;
  bool Changed = false;

  LLVM_DEBUG(dbgs() << "AMDIC: process function ";
             F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);

  for (auto &BB : F) {
      if (Simplifier.fold(CI, AA))
        Changed = true;
  bool Changed = false;
  for (auto &BB : F) {
      if (Callee == nullptr)
        continue;

  bool Changed = false;
  for (auto &BB : F) {