LLVM 20.0.0git
AMDGPUTargetTransformInfo.h
//===- AMDGPUTargetTransformInfo.h - AMDGPU specific TTI --------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
/// \file
/// This file provides a TargetTransformInfo::Concept conforming object
/// specific to the AMDGPU target machine. It uses the target's detailed
/// information to provide more precise answers to certain TTI queries, while
/// letting the target-independent and default TTI implementations handle the
/// rest.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H

#include "AMDGPU.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include <optional>

namespace llvm {

class AMDGPUTargetMachine;
class GCNSubtarget;
class InstCombiner;
class Loop;
class ScalarEvolution;
class SITargetLowering;
class Type;
class Value;

class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
  using BaseT = BasicTTIImplBase<AMDGPUTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  Triple TargetTriple;

  const TargetSubtargetInfo *ST;
  const TargetLoweringBase *TLI;

  const TargetSubtargetInfo *getST() const { return ST; }
  const TargetLoweringBase *getTLI() const { return TLI; }

public:
  explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
};

class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
  using BaseT = BasicTTIImplBase<GCNTTIImpl>;
  using TTI = TargetTransformInfo;

  friend BaseT;

  const GCNSubtarget *ST;
  const SITargetLowering *TLI;
  AMDGPUTTIImpl CommonTTI;
  bool IsGraphics;
  bool HasFP32Denormals;
  bool HasFP64FP16Denormals;
  static constexpr bool InlinerVectorBonusPercent = 0;

  static const FeatureBitset InlineFeatureIgnoreList;

  const GCNSubtarget *getST() const { return ST; }
  const SITargetLowering *getTLI() const { return TLI; }

  static inline int getFullRateInstrCost() {
    return TargetTransformInfo::TCC_Basic;
  }

  static inline int getHalfRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 2 * TargetTransformInfo::TCC_Basic;
  }

  // TODO: The size is usually 8 bytes, but takes 4x as many cycles. Maybe
  // should be 2 or 4.
  static inline int getQuarterRateInstrCost(TTI::TargetCostKind CostKind) {
    return CostKind == TTI::TCK_CodeSize ? 2
                                         : 4 * TargetTransformInfo::TCC_Basic;
  }

  // On some parts, normal fp64 operations are half rate, and others
  // quarter. This also applies to some integer operations.
  int get64BitInstrCost(TTI::TargetCostKind CostKind) const;

  std::pair<InstructionCost, MVT> getTypeLegalizationCost(Type *Ty) const;

public:
  explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);

  bool hasBranchDivergence(const Function *F = nullptr) const;

  void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                               TTI::UnrollingPreferences &UP,
                               OptimizationRemarkEmitter *ORE);

  void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
                             TTI::PeelingPreferences &PP);

  TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
    assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
    return TTI::PSK_FastHardware;
  }

  unsigned getNumberOfRegisters(unsigned RCID) const;
  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const;
  unsigned getMinVectorRegisterBitWidth() const;
123 unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const;
124 unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
125 unsigned ChainSizeInBytes,
126 VectorType *VecTy) const;
127 unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize,
128 unsigned ChainSizeInBytes,
129 VectorType *VecTy) const;
130 unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
131
132 bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
133 unsigned AddrSpace) const;
134 bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
135 unsigned AddrSpace) const;
136 bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
137 unsigned AddrSpace) const;
138
  int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
  Type *
  getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
                            unsigned SrcAddrSpace, unsigned DestAddrSpace,
                            Align SrcAlign, Align DestAlign,
                            std::optional<uint32_t> AtomicElementSize) const;

  void getMemcpyLoopResidualLoweringType(
      SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
      unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
      Align SrcAlign, Align DestAlign,
      std::optional<uint32_t> AtomicCpySize) const;
  unsigned getMaxInterleaveFactor(ElementCount VF);

  bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;

  InstructionCost getArithmeticInstrCost(
      unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
      TTI::OperandValueInfo Op1Info = {TTI::OK_AnyValue, TTI::OP_None},
      TTI::OperandValueInfo Op2Info = {TTI::OK_AnyValue, TTI::OP_None},
      ArrayRef<const Value *> Args = std::nullopt,
      const Instruction *CxtI = nullptr);

  InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
                                 const Instruction *I = nullptr);

  bool isInlineAsmSourceOfDivergence(const CallInst *CI,
                                     ArrayRef<unsigned> Indices = {}) const;

  using BaseT::getVectorInstrCost;
  InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                     TTI::TargetCostKind CostKind,
                                     unsigned Index, Value *Op0, Value *Op1);

  bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const;
  bool isSourceOfDivergence(const Value *V) const;
  bool isAlwaysUniform(const Value *V) const;

  bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
    if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
      switch (FromAS) {
      case AMDGPUAS::GLOBAL_ADDRESS:
      case AMDGPUAS::CONSTANT_ADDRESS:
      case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
      case AMDGPUAS::LOCAL_ADDRESS:
      case AMDGPUAS::PRIVATE_ADDRESS:
        return true;
      default:
        break;
      }
      return false;
    }
    if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
         ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
        (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
         ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
      return true;
    return false;
  }

  bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
    return AMDGPU::addrspacesMayAlias(AS0, AS1);
  }

  unsigned getFlatAddressSpace() const {
    // Don't bother running InferAddressSpaces pass on graphics shaders which
    // don't use flat addressing.
    if (IsGraphics)
      return -1;
    return AMDGPUAS::FLAT_ADDRESS;
  }

  bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
                                  Intrinsic::ID IID) const;

  bool canHaveNonUndefGlobalInitializerInAddressSpace(unsigned AS) const {
    return AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
           AS != AMDGPUAS::PRIVATE_ADDRESS;
  }

  Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
                                          Value *NewV) const;

  bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
                                 const Value *Op1, InstCombiner &IC) const;
  std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
                                                    IntrinsicInst &II) const;
  std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
      InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
      APInt &UndefElts2, APInt &UndefElts3,
      std::function<void(Instruction *, unsigned, APInt, APInt &)>
          SimplifyAndSetOp) const;

  InstructionCost getVectorSplitCost() { return 0; }

  InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                 ArrayRef<int> Mask,
                                 TTI::TargetCostKind CostKind, int Index,
                                 VectorType *SubTp,
                                 ArrayRef<const Value *> Args = std::nullopt,
                                 const Instruction *CxtI = nullptr);

  bool areInlineCompatible(const Function *Caller,
                           const Function *Callee) const;

  unsigned getInliningThresholdMultiplier() const { return 11; }
  unsigned adjustInliningThreshold(const CallBase *CB) const;
  unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;

  int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }

  InstructionCost getArithmeticReductionCost(
      unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
      TTI::TargetCostKind CostKind);

  InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                        TTI::TargetCostKind CostKind);
  InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                         FastMathFlags FMF,
                                         TTI::TargetCostKind CostKind);

  /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
  unsigned getCacheLineSize() const override { return 128; }

  /// How much before a load we should place the prefetch instruction.
  /// This is currently measured in number of IR instructions.
  unsigned getPrefetchDistance() const override;

  /// \return true if the target wants to issue a prefetch in address space
  /// \p AS.
  bool shouldPrefetchAddressSpace(unsigned AS) const override;
};

} // end namespace llvm

#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETTRANSFORMINFO_H
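The declarations above are only reached through the generic TargetTransformInfo facade; passes never name GCNTTIImpl directly. As a minimal sketch of how that happens, the hypothetical function pass below (the name QueryAMDGPUTTIExample and its body are illustrative, not part of LLVM or of this header) obtains TTI via TargetIRAnalysis and issues a few of the queries declared here, assuming it runs on code compiled for an AMDGPU target.

// Hypothetical example only; not part of AMDGPUTargetTransformInfo.h.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

struct QueryAMDGPUTTIExample : PassInfoMixin<QueryAMDGPUTTIExample> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM) {
    // TargetIRAnalysis dispatches to the target's TTI implementation; for
    // AMDGPU functions that is the GCNTTIImpl declared in this header.
    const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);

    // Divergence-aware transforms gate on this query; the AMDGPU
    // implementation generally reports branch divergence.
    if (!TTI.hasBranchDivergence(&F))
      return PreservedAnalyses::all();

    // Per getFlatAddressSpace() above, graphics shaders report no flat
    // address space (-1), telling InferAddressSpaces there is nothing to do.
    unsigned FlatAS = TTI.getFlatAddressSpace();
    (void)FlatAS;

    // Count values the target considers divergent, e.g. to decide whether a
    // uniformity-dependent rewrite is worthwhile.
    unsigned DivergentValues = 0;
    for (Instruction &I : instructions(F))
      if (TTI.isSourceOfDivergence(&I))
        ++DivergentValues;
    (void)DivergentValues;

    return PreservedAnalyses::all();
  }
};

Nothing in the sketch includes AMDGPU-specific headers; the same pass body simply behaves conservatively on targets whose TTI reports no branch divergence.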