LLVM 20.0.0git
AMDGPUTargetTransformInfo.h
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This file describes a TargetTransformInfo::Concept conforming object specific to the
11/// AMDGPU target machine. It uses the target's detailed information to
12/// provide more precise answers to certain TTI queries, while letting the
13/// target independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
18#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
19
20#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
23#include <optional>
24
25namespace llvm {
26
27class AMDGPUTargetMachine;
28class GCNSubtarget;
29class InstCombiner;
30class Loop;
31class ScalarEvolution;
32class SITargetLowering;
33class Type;
34class Value;
35
36class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
39
40 friend BaseT;
41
42 Triple TargetTriple;
43
44 const TargetSubtargetInfo *ST;
45 const TargetLoweringBase *TLI;
46
47 const TargetSubtargetInfo *getST() const { return ST; }
48 const TargetLoweringBase *getTLI() const { return TLI; }
49
50public:
51 explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
52
56
59
61};
62
63class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
66
67 friend BaseT;
68
69 const GCNSubtarget *ST;
70 const SITargetLowering *TLI;
71 AMDGPUTTIImpl CommonTTI;
72 bool IsGraphics;
73 bool HasFP32Denormals;
74 bool HasFP64FP16Denormals;
75 static constexpr bool InlinerVectorBonusPercent = 0;
76
77 static const FeatureBitset InlineFeatureIgnoreList;
78
79 const GCNSubtarget *getST() const { return ST; }
80 const SITargetLowering *getTLI() const { return TLI; }
81
82 static inline int getFullRateInstrCost() {
84 }
85
86 static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
87 return CostKind == TTI::TCK_CodeSize ? 2
89 }
90
91 // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
92 // should be 2 or 4.
93 static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
94 return CostKind == TTI::TCK_CodeSize ? 2
96 }
97
98 // On some parts, normal fp64 operations are half rate, and others
99 // quarter. This also applies to some integer operations.
100 int get64BitInstrCost(TTI::TargetCostKind CostKind) const;
101
102 std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;
103
104public:
105 explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
106
107 bool hasBranchDivergence(const Function *F = nullptr) const;
108
112
115
117 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
119 }
120
121 unsigned getNumberOfRegisters(unsigned RCID) const;
123 unsigned getMinVectorRegisterBitWidth() const;
124 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
125 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
126 unsigned ChainSizeInBytes,
127 VectorType *VecTy) const;
128 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
129 unsigned ChainSizeInBytes,
130 VectorType *VecTy) const;
131 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
132
133 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
134 unsigned AddrSpace) const;
135 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
136 unsigned AddrSpace) const;
137 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
138 unsigned AddrSpace) const;
139
141 Type *
143 unsigned SrcAddrSpace, unsigned DestAddrSpace,
144 Align SrcAlign, Align DestAlign,
145 std::optional<uint32_t> AtomicElementSize) const;
146
148 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
149 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
150 Align SrcAlign, Align DestAlign,
151 std::optional<uint32_t> AtomicCpySize) const;
153
155
157 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
159 TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
160 ArrayRef<const Value *> Args = {}, const Instruction *CxtI = nullptr);
161
162 InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
163 const Instruction *I = nullptr);
164
165 bool isInlineAsmSourceOfDivergence(const CallInst *CI,
166 ArrayRef<unsigned> Indices = {}) const;
167
169 InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
171 unsigned Index, Value *Op0, Value *Op1);
172
173 bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
174 bool isSourceOfDivergence(const Value *V) const;
175 bool isAlwaysUniform(const Value *V) const;
176
177 bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
178 // Address space casts must cast between different address spaces.
179 if (FromAS == ToAS)
180 return false;
181
182 if (FromAS == AMDGPUAS::FLAT_ADDRESS)
184 ToAS == AMDGPUAS::LOCAL_ADDRESS ||
186
188 return AMDGPU::isFlatGlobalAddrSpace(ToAS) ||
190
191 if (FromAS == AMDGPUAS::LOCAL_ADDRESS ||
193 return ToAS == AMDGPUAS::FLAT_ADDRESS;
194
195 return false;
196 }
197
198 bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
199 return AMDGPU::addrspacesMayAlias(AS0, AS1);
200 }
201
202 unsigned getFlatAddressSpace() const {
203 // Don't bother running InferAddressSpaces pass on graphics shaders which
204 // don't use flat addressing.
205 if (IsGraphics)
206 return -1;
208 }
209
211 Intrinsic::ID IID) const;
212
216 }
217
219 Value *NewV) const;
220
221 bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
222 const Value *Op1, InstCombiner &IC) const;
223
225 unsigned LaneAgIdx) const;
226
227 std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
228 IntrinsicInst &II) const;
229 std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
230 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
231 APInt &UndefElts2, APInt &UndefElts3,
232 std::function<void(Instruction *, unsigned, APInt, APInt &)>
233 SimplifyAndSetOp) const;
234
236
238 ArrayRef<int> Mask,
240 VectorType *SubTp,
241 ArrayRef<const Value *> Args = {},
242 const Instruction *CxtI = nullptr);
243
244 bool isProfitableToSinkOperands(Instruction *I,
245 SmallVectorImpl<Use *> &Ops) const;
246
247 bool areInlineCompatible(const Function *Caller,
248 const Function *Callee) const;
249
251 unsigned getInliningThresholdMultiplier() const { return 11; }
252 unsigned adjustInliningThreshold(const CallBase *CB) const;
253 unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
254
255 int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }
256
258 unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
260
264 FastMathFlags FMF,
266
267 /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
268 unsigned getCacheLineSize() const override { return 128; }
269
270 /// How much before a load we should place the prefetch instruction.
271 /// This is currently measured in number of IR instructions.
272 unsigned getPrefetchDistance() const override;
273
274 /// \return if target want to issue a prefetch in address space \p AS.
275 bool shouldPrefetchAddressSpace(unsigned AS) const override;
276};
277
278} // end namespace llvm
279
280#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
AMDGPU address space definition.
This file provides a helper that implements much of the TTI interface in terms of the target-independent code generator and TargetLowering interfaces.
RelocType Type
Definition: COFFYAML.cpp:410
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
uint32_t Index
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
Machine InstCombiner
uint64_t IntrinsicInst * II
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
Class for arbitrary precision integers.
Definition: APInt.h:78
an instruction to allocate memory on the stack
Definition: Instructions.h:63
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
Base class which can be used to help build a TTI implementation.
Definition: BasicTTIImpl.h:80
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1120
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isAlwaysUniform(const Value *V) const
bool simplifyDemandedLaneMaskArg(InstCombiner &IC, IntrinsicInst &II, unsigned LaneAgIdx) const
Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth)
int getInliningLastCallToStaticBonus() const
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const
unsigned getNumberOfRegisters(unsigned RCID) const
bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getCacheLineSize() const override
Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
unsigned getInliningThresholdMultiplier() const
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getFlatAddressSpace() const
int getInlinerVectorBonusPercent() const
InstructionCost getVectorSplitCost()
unsigned getMinVectorRegisterBitWidth() const
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
unsigned adjustInliningThreshold(const CallBase *CB) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
bool isSourceOfDivergence(const Value *V) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const
bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0, const Value *Op1, InstCombiner &IC) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
bool hasBranchDivergence(const Function *F=nullptr) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
The core instruction combiner logic.
Definition: InstCombiner.h:48
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:48
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:39
The optimization diagnostic interface.
The main scalar evolution driver.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:573
This base class for TargetLowering contains the SelectionDAG-independent parts that can be used from ...
TargetSubtargetInfo - Generic base class for all target subtargets.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Triple - Helper class for working with autoconf configuration names.
Definition: Triple.h:44
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
Base class of all SIMD vector types.
Definition: DerivedTypes.h:427
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
bool isFlatGlobalAddrSpace(unsigned AS)
static bool addrspacesMayAlias(unsigned AS1, unsigned AS2)
Definition: AMDGPU.h:471
bool isExtendedGlobalAddrSpace(unsigned AS)
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
@ Length
Definition: DWP.cpp:480
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition: MathExtras.h:291
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Information about a load/store intrinsic defined by the target.
Parameters that control the generic loop unrolling transformation.