#include "llvm/IR/IntrinsicsAMDGPU.h"

#define DEBUG_TYPE "AMDGPUtti"
30 "amdgpu-unroll-threshold-private",
31 cl::desc(
"Unroll threshold for AMDGPU if private memory used in a loop"),
35 "amdgpu-unroll-threshold-local",
36 cl::desc(
"Unroll threshold for AMDGPU if local memory used in a loop"),
40 "amdgpu-unroll-threshold-if",
41 cl::desc(
"Unroll threshold increment for AMDGPU for each if statement inside loop"),
45 "amdgpu-unroll-runtime-local",
46 cl::desc(
"Allow runtime unroll for AMDGPU if local memory used in a loop"),
50 "amdgpu-use-legacy-divergence-analysis",
51 cl::desc(
"Enable legacy divergence analysis for AMDGPU"),
55 "amdgpu-unroll-max-block-to-analyze",
56 cl::desc(
"Inner loop block size threshold to analyze in unroll for AMDGPU"),
61 cl::desc(
"Cost of alloca argument"));
69 cl::desc(
"Maximum alloca size to use for inline cost"));
74 cl::desc(
"Maximum number of BBs allowed in a function after inlining"
75 " (compile time constraint)"));
  for (const Value *V : I->operand_values()) {
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
          return SubLoop->contains(PHI); }))

      TargetTriple(TM->getTargetTriple()),
      TLI(ST->getTargetLowering()) {}
  const unsigned MaxAlloca = (256 - 16) * 4;
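  // (256 - 16) * 4 evaluates to 960, so this reads as a 960-byte cap on the
  // private (scratch) allocas that the unrolling heuristic below will boost
  // the threshold for; this is an interpretation of the constant, since the
  // original explanatory comment is not preserved in this excerpt.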
  if (MDNode *LoopUnrollThreshold =
    if (LoopUnrollThreshold->getNumOperands() == 2) {
      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  unsigned LocalGEPsSeen = 0;

               return SubLoop->contains(BB); }))

    if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
      if (UP.Threshold < MaxBoost && Br->isConditional()) {
                          << *L << " due to " << *Br << '\n');

      unsigned AS = GEP->getAddressSpace();

      const Value *Ptr = GEP->getPointerOperand();

        if (AllocaSize > MaxAlloca)

            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
                          << *L << " due to LDS use.\n");

        bool HasLoopDef = false;
                     return SubLoop->contains(Inst); }))

                        << *L << " due to " << *GEP << '\n');
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
    AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
    AMDGPU::FeatureUnalignedAccessMode,
    AMDGPU::FeatureAutoWaitcntBeforeBarrier,
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,
    AMDGPU::FeatureSRAMECC,
    AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
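// Judging by its use further down (CallerBits & ~InlineFeatureIgnoreList),
// this initializer list names the subtarget features that are deliberately
// ignored when deciding whether a callee's feature set is compatible with
// its caller for inlining.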
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
                     ST->getFlatWorkGroupSizes(F).second)))) {
  HasFP32Denormals = Mode.allFP32Denormals();
  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
    return 32 * 4 / ElemWidth;

                                               unsigned ChainSizeInBytes,
  unsigned VecRegBitWidth = VF * LoadSize;
    return 128 / LoadSize;

                                                unsigned ChainSizeInBytes,
  unsigned VecRegBitWidth = VF * StoreSize;
  if (VecRegBitWidth > 128)
    return 128 / StoreSize;
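// Hypothetical helper restating the clamp above: when a chain of VF lanes
// would exceed the 128-bit vector-register budget, the vectorization factor
// is reduced so each piece fits (e.g. 8 lanes of 32-bit stores give 256 bits,
// clamped back to 128 / 32 = 4 lanes).
static unsigned clampedVectorFactor(unsigned VF, unsigned ElemSizeInBits) {
  unsigned VecRegBitWidth = VF * ElemSizeInBits;
  if (VecRegBitWidth > 128)
    return 128 / ElemSizeInBits;
  return VF;
}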
                                            unsigned AddrSpace) const {

                                              unsigned AddrSpace) const {

                                             unsigned AddrSpace) const {

    unsigned SrcAddrSpace,
    unsigned DestAddrSpace,
    unsigned DestAlign) const {

    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    unsigned SrcAlign, unsigned DestAlign) const {
429 assert(RemainingBytes < 16);
435 while (RemainingBytes >= 8) {
436 OpsOut.push_back(I64Ty);
441 while (RemainingBytes >= 4) {
442 OpsOut.push_back(I32Ty);
448 while (RemainingBytes >= 2) {
449 OpsOut.push_back(I16Ty);
454 while (RemainingBytes) {
455 OpsOut.push_back(I8Ty);
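// Standalone restatement of the greedy selection above, using plain integer
// widths instead of LLVM Type* values (a sketch, not code from this file):
// the residual byte count is decomposed largest-piece-first, so 15 bytes
// become one i64, one i32, one i16 and one i8 access.
#include <vector>
static std::vector<unsigned> residualPieceWidths(unsigned RemainingBytes) {
  std::vector<unsigned> WidthsInBits;
  for (unsigned Bytes : {8u, 4u, 2u, 1u})
    while (RemainingBytes >= Bytes) {
      WidthsInBits.push_back(Bytes * 8);
      RemainingBytes -= Bytes;
    }
  return WidthsInBits; // e.g. 15 -> {64, 32, 16, 8}
}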
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)

    unsigned OrderingVal = Ordering->getZExtValue();

    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isNullValue();
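    // Reading of the case above: these DS/atomic intrinsics carry their
    // atomic ordering as operand 2 and a volatile flag as operand 4, and both
    // must be compile-time constants before the memory-intrinsic info is
    // filled in.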
                                           Opd1Info, Opd2Info, Opd1PropInfo,
                                           Opd2PropInfo, Args, CxtI);

  assert(ISD && "Invalid opcode");

  unsigned OpCost = (IsFloat ? 2 : 1);

    return LT.first * OpCost;

    return LT.first * 2 * OpCost;

  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
    unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
        Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
        Opd1PropInfo, Opd2PropInfo, Args, CxtI);

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

      return get64BitInstrCost(CostKind) * LT.first * NElts;

      NElts = (NElts + 1) / 2;

    return getFullRateInstrCost() * LT.first * NElts;

      return 2 * getFullRateInstrCost() * LT.first * NElts;

      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();

    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);

      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;

    NElts = (NElts + 1) / 2;

    return QuarterRateCost * NElts * LT.first;
    if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {

      NElts = (NElts + 1) / 2;
      return LT.first * NElts * get64BitInstrCost(CostKind);

      NElts = (NElts + 1) / 2;
    return LT.first * NElts * getFullRateInstrCost();

      int Cost = 7 * get64BitInstrCost(CostKind) +

        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;

      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
        return LT.first * getQuarterRateInstrCost(CostKind) * NElts;

          4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
      return LT.first * Cost * NElts;

    int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
               1 * getQuarterRateInstrCost(CostKind);

    if (!HasFP32Denormals) {
      Cost += 2 * getFullRateInstrCost();

    return LT.first * NElts * Cost;

                                         Opd1PropInfo, Opd2PropInfo, Args, CxtI);
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:

  if (ICA.getID() == Intrinsic::fabs)

      (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()

    ScalarizationCost = 0;

  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

    return LT.first * NElts * get64BitInstrCost(CostKind);

    NElts = (NElts + 1) / 2;

  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
               : getQuarterRateInstrCost(CostKind);

  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:

  return LT.first * NElts * InstRate;
    return Opcode == Instruction::PHI ? 0 : 1;

  case Instruction::Br:

  return LT.first * getFullRateInstrCost();

                                           bool IsPairwise, bool IsUnsigned,

  return LT.first * getHalfRateInstrCost(CostKind);

  case Instruction::ExtractElement:
  case Instruction::InsertElement: {

    return Index == ~0u ? 2 : 0;
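    // By TTI convention an Index of ~0u means the lane is not known at
    // compile time, so only dynamic extracts/inserts are given a cost here;
    // constant-index ones are treated as free. This is an interpretation,
    // since the surrounding comments are not preserved in this excerpt.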
  if (Indices.size() > 1)

  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];

  for (auto &TC : TargetConstraints) {

    if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)

        TRI, TC.ConstraintCode, TC.ConstraintVT);

      RC = TRI->getPhysRegClass(AssignedReg);

    if (!RC || !TRI->isSGPRClass(RC))
  if (const Argument *A = dyn_cast<Argument>(V))

  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())

  if (isa<InvokeInst>(V))

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    switch (Intrinsic->getIntrinsicID()) {
    case Intrinsic::amdgcn_readfirstlane:
    case Intrinsic::amdgcn_readlane:
    case Intrinsic::amdgcn_icmp:
    case Intrinsic::amdgcn_fcmp:
    case Intrinsic::amdgcn_ballot:
    case Intrinsic::amdgcn_if_break:

  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
    if (CI->isInlineAsm())

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
    switch (Intrinsic->getIntrinsicID()) {
    case Intrinsic::amdgcn_if:
    case Intrinsic::amdgcn_else: {
      return Indices.size() == 1 && Indices[0] == 1;
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax:
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private:
    OpIndexes.push_back(0);

                                                    Value *NewV) const {
  case Intrinsic::amdgcn_atomic_inc:
  case Intrinsic::amdgcn_atomic_dec:
  case Intrinsic::amdgcn_ds_fadd:
  case Intrinsic::amdgcn_ds_fmin:
  case Intrinsic::amdgcn_ds_fmax: {

  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?

  case Intrinsic::ptrmask: {

    bool DoTruncate = false;

    if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {

      MaskTy = B.getInt32Ty();
      MaskOp = B.CreateTrunc(MaskOp, MaskTy);

    return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1108 if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1131 =
static_cast<const GCNSubtarget *
>(
TM.getSubtargetImpl(*Caller));
1135 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1136 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1138 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1139 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1140 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
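// The test above requires the callee's effective feature set to be a subset
// of the caller's once the ignore list is masked out. A minimal standalone
// sketch of the same check, with std::bitset standing in for FeatureBitset
// (names and bit width are assumptions for illustration):
#include <bitset>
static bool calleeFeaturesSubsetOfCaller(std::bitset<256> CallerBits,
                                         std::bitset<256> CalleeBits,
                                         std::bitset<256> IgnoreList) {
  std::bitset<256> RealCaller = CallerBits & ~IgnoreList;
  std::bitset<256> RealCallee = CalleeBits & ~IgnoreList;
  return (RealCaller & RealCallee) == RealCallee;
}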
  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
      Callee->hasFnAttribute(Attribute::InlineHint))

  size_t BBSize = Caller->size() + Callee->size() - 1;
  uint64_t AllocaSize = 0;

    PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());

    if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
      if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
             ? getFullRateInstrCost()
             : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
                                      : getQuarterRateInstrCost(CostKind);
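  // Reading of the selection above: full-rate cost when the subtarget runs
  // 64-bit instructions at full rate, half-rate cost when it only has
  // half-rate 64-bit ops, and quarter-rate cost otherwise. The condition on
  // the elided line before the '?' is not preserved in this excerpt.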
      TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
                                            unsigned AddrSpace) const {

                                              unsigned AddrSpace) const {

                                             unsigned AddrSpace) const {
    return Opcode == Instruction::PHI ? 0 : 1;

  case Instruction::Br:

  case Instruction::ExtractElement:
  case Instruction::InsertElement: {

    return Index == ~0u ? 2 : 0;