LLVM 17.0.0git
R600TargetTransformInfo.cpp
Go to the documentation of this file.
1//===- R600TargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// R600 target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPU.h"
19#include "AMDGPUTargetMachine.h"
20#include "R600Subtarget.h"
21
22using namespace llvm;
23
24#define DEBUG_TYPE "R600tti"
25
27 : BaseT(TM, F.getParent()->getDataLayout()),
28 ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
29 TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
30
32 return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
33}
34
35unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
37}
38
41 return TypeSize::getFixed(32);
42}
43
44unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const { return 32; }
45
46unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
47 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
48 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
49 return 128;
50 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
51 AddrSpace == AMDGPUAS::REGION_ADDRESS)
52 return 64;
53 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
54 return 32;
55
56 if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
57 AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
58 (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
59 AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
60 return 128;
61 llvm_unreachable("unhandled address space");
62}
63
64bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
65 Align Alignment,
66 unsigned AddrSpace) const {
67 // We allow vectorization of flat stores, even though we may need to decompose
68 // them later if they may access private memory. We don't have enough context
69 // here, and legalization can handle it.
70 return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
71}
72
73bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
74 Align Alignment,
75 unsigned AddrSpace) const {
76 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
77}
78
79bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
80 Align Alignment,
81 unsigned AddrSpace) const {
82 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
83}
84
86 // Disable unrolling if the loop is not vectorized.
87 // TODO: Enable this again.
88 if (VF.isScalar())
89 return 1;
90
91 return 8;
92}
93
96 const Instruction *I) {
98 return Opcode == Instruction::PHI ? 0 : 1;
99
100 // XXX - For some reason this isn't called for switch.
101 switch (Opcode) {
102 case Instruction::Br:
103 case Instruction::Ret:
104 return 10;
105 default:
106 return BaseT::getCFInstrCost(Opcode, CostKind, I);
107 }
108}
109
112 unsigned Index, Value *Op0,
113 Value *Op1) {
114 switch (Opcode) {
115 case Instruction::ExtractElement:
116 case Instruction::InsertElement: {
117 unsigned EltSize =
118 DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
119 if (EltSize < 32) {
120 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
121 Op1);
122 }
123
124 // Extracts are just reads of a subregister, so are free. Inserts are
125 // considered free because we don't want to have any cost for scalarizing
126 // operations, and we don't have to copy into a different register class.
127
128 // Dynamic indexing isn't free and is best avoided.
129 return Index == ~0u ? 2 : 0;
130 }
131 default:
132 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
133 }
134}
135
139 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
140}
141
144 CommonTTI.getPeelingPreferences(L, SE, PP);
145}
aarch64 promote const
The AMDGPU TargetMachine interface definition for hw codegen targets.
static const Function * getParent(const Value *V)
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
const char LLVMTargetMachineRef TM
AMDGPU R600 specific subclass of TargetSubtarget.
This file provides a TargetTransformInfo::Concept conforming object specific to the R600 target machine.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:673
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:302
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
The optimization diagnostic interface.
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getHardwareNumberOfRegisters(bool Vec) const
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
unsigned getMinVectorRegisterBitWidth() const
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
unsigned getNumberOfRegisters(bool Vec) const
The main scalar evolution driver.
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
LLVM Value Representation.
Definition: Value.h:74
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_BUFFER_15
Definition: AMDGPU.h:433
@ PARAM_D_ADDRESS
end Internal address spaces.
Definition: AMDGPU.h:408
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:390
@ CONSTANT_BUFFER_0
Definition: AMDGPU.h:418
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:393
@ PARAM_I_ADDRESS
Address space for indirect addressable parameter memory (VTX1).
Definition: AMDGPU.h:410
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:392
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:389
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:394
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
Parameters that control the generic loop unrolling transformation.