//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file declares a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//
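//
// Illustrative sketch (an assumption for orientation, not part of this
// header): the target machine hands these implementations to IR-level passes
// through its getTargetTransformInfo hook, roughly as the AMDGPU target does
// in AMDGPUTargetMachine.cpp:
//
// \code
//   TargetTransformInfo
//   GCNTargetMachine::getTargetTransformInfo(const Function &F) const {
//     // Wrap the GCN-specific implementation in the type-erased
//     // TargetTransformInfo object that IR-level passes query.
//     return TargetTransformInfo(GCNTTIImpl(this, F));
//   }
// \endcode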

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr bool InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
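
  // Worked example (hypothetical subtarget; a sketch, not normative): for an
  // fp64 multiply on a quarter-rate part, TCC_Basic == 1, so the helpers
  // above give:
  //
  // \code
  //   int Thru = getQuarterRateInstrCost(TTI::TCK_RecipThroughput);
  //   // == 4 * TargetTransformInfo::TCC_Basic == 4
  //   int Size = getQuarterRateInstrCost(TTI::TCK_CodeSize);
  //   // == 2 (only the wider encoding is counted, not the extra cycles)
  // \endcode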

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const;
  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
  unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
  unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
                               unsigned ChainSizeInBytes,
                               VectorType *VecTy) const;
  unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
                                unsigned ChainSizeInBytes,
                                VectorType *VecTy) const;
  unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;

  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
                                  unsigned AddrSpace) const;
  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
                                   unsigned AddrSpace) const;
  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
                                    unsigned AddrSpace) const;

  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
  Type *getMemcpyLoopLoweringType(
      LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
      unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
      std::optional<uint32_t> AtomicElementSize) const;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      unsigned SrcAlign, unsigned DestAlign,
      std::optional<uint32_t> AtomicCpySize) const;
  unsigned getMaxInterleaveFactor(ElementCount VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
      const Instruction *CxtI = nullptr);

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1);

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

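  // Sketch of the intended classification (hypothetical Value pointers Tid
  // and RFL; the authoritative rules live in the implementation file):
  //
  // \code
  //   // Tid: a call to llvm.amdgcn.workitem.id.x -- a per-lane value.
  //   bool DivergentTid = TTIImpl.isSourceOfDivergence(Tid); // true
  //   // RFL: a call to llvm.amdgcn.readfirstlane -- a wave-wide broadcast.
  //   bool UniformRFL = TTIImpl.isAlwaysUniform(RFL);        // true
  // \endcode
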
  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
      switch (FromAS) {
      case AMDGPUAS::GLOBAL_ADDRESS:
      case AMDGPUAS::CONSTANT_ADDRESS:
      case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
      case AMDGPUAS::LOCAL_ADDRESS:
      case AMDGPUAS::PRIVATE_ADDRESS:
        return true;
      default:
        break;
      }
      return false;
    }
    if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
         ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
        (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
         ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
      return true;
    return false;
  }
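
  // Usage sketch (TTIImpl is a hypothetical instance): casts into the flat
  // space and constant <-> 32-bit-constant round trips are accepted:
  //
  // \code
  //   TTIImpl.isValidAddrSpaceCast(AMDGPUAS::LOCAL_ADDRESS,
  //                                AMDGPUAS::FLAT_ADDRESS);           // true
  //   TTIImpl.isValidAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
  //                                AMDGPUAS::CONSTANT_ADDRESS_32BIT); // true
  //   TTIImpl.isValidAddrSpaceCast(AMDGPUAS::LOCAL_ADDRESS,
  //                                AMDGPUAS::PRIVATE_ADDRESS);        // false
  // \endcode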

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }
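
  // How a consumer reads this hook (a sketch with a hypothetical TTIImpl):
  // InferAddressSpaces treats the result as the generic space to rewrite
  // away from, and the unsigned-wrapped -1 as "no flat address space".
  //
  // \code
  //   unsigned FlatAS = TTIImpl.getFlatAddressSpace();
  //   if (FlatAS == ~0u) {
  //     // Graphics shader: the pass skips the function entirely.
  //   } else {
  //     // Compute: flat accesses may be rewritten to the specific
  //     // global/LDS/private spaces, which are cheaper to access.
  //   }
  // \endcode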

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;

  bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;
  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) const;
  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  InstructionCost getVectorSplitCost() { return 0; }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  unsigned getInliningThresholdMultiplier() const { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const;
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;

  int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }

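  // Rough effect of the values above (a sketch; the exact accounting lives
  // in the inliner): the multiplier scales the base inline threshold, and
  // the vector bonus is disabled. Assuming the default -inline-threshold=225:
  //
  // \code
  //   unsigned Effective = 225 * TTIImpl.getInliningThresholdMultiplier();
  //   // == 225 * 11 == 2475, before per-call-site adjustments.
  // \endcode
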
  InstructionCost getArithmeticReductionCost(
      unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
      TTI::TargetCostKind CostKind);

  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind);
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind);
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H