LLVM 4.0.0
AMDGPUTargetTransformInfo.cpp
//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//
#include "AMDGPUTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/Support/Debug.h"
#include "llvm/Target/CostTable.h"
#include "llvm/Target/TargetLowering.h"

using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                            TTI::UnrollingPreferences &UP) {
  UP.Threshold = 300; // Twice the default.
  UP.MaxCount = UINT_MAX;
  UP.Partial = true;

  // TODO: Do we want runtime unrolling?

  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getModule()->getDataLayout();
    for (const Instruction &I : *BB) {
      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
        continue;

      const Value *Ptr = GEP->getPointerOperand();
      const AllocaInst *Alloca =
          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
      if (Alloca) {
        // We want to do whatever we can to limit the number of alloca
        // instructions that make it through to the code generator. allocas
        // require us to use indirect addressing, which is slow and prone to
        // compiler bugs. If this loop does an address calculation on an
        // alloca ptr, then we want to use a higher than normal loop unroll
        // threshold. This will give SROA a better chance to eliminate these
        // allocas.
        //
        // Don't use the maximum allowed value here as it will make some
        // programs way too big.
        UP.Threshold = 800;
      }
    }
  }
}

unsigned AMDGPUTTIImpl::getNumberOfRegisters(bool Vec) {
  if (Vec)
    return 0;

  // Number of VGPRs on SI.
  if (ST->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)
    return 256;

  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}

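// Vector registers are reported as absent and the scalar register width as
// 32 bits: VGPRs are 32 bits per work item, and data parallelism comes from
// the wavefront rather than from in-thread vector operations.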
unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
  return Vector ? 0 : 32;
}

unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  switch (AddrSpace) {
  case AMDGPUAS::GLOBAL_ADDRESS:
  case AMDGPUAS::CONSTANT_ADDRESS:
  case AMDGPUAS::FLAT_ADDRESS:
    return 128;
  case AMDGPUAS::LOCAL_ADDRESS:
  case AMDGPUAS::REGION_ADDRESS:
    return 64;
  case AMDGPUAS::PRIVATE_ADDRESS:
    return 8 * ST->getMaxPrivateElementSize();
  default:
    if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
        (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
         AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
         (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
          AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
      return 128;
    llvm_unreachable("unhandled address space");
  }
}

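// The interleave factor caps how many independent copies of a loop body the
// loop vectorizer will interleave; a large value presumably helps keep many
// independent memory operations in flight to hide latency.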
unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
  // Semi-arbitrary large amount.
  return 64;
}

int AMDGPUTTIImpl::getArithmeticInstrCost(
    unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
    TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
    TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args) {

  EVT OrigTy = TLI->getValueType(DL, Ty);
  if (!OrigTy.isSimple()) {
    return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                         Opd1PropInfo, Opd2PropInfo);
  }

  // Legalize the type.
  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but we do have legal
  // vector types, we need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA: {
    if (SLT == MVT::i64)
      return get64BitInstrCost() * LT.first * NElts;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  }
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR: {
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    return LT.first * NElts * getFullRateInstrCost();
  }
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost();
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
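      // Rough model: a 64-bit multiply expands to four 32-bit multiplies
      // (quarter rate) plus roughly four full-rate instructions to combine
      // the partial products.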
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FADD:
  case ISD::FSUB:
  case ISD::FMUL:
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost();

    if (SLT == MVT::f32 || SLT == MVT::f16)
      return LT.first * NElts * getFullRateInstrCost();
    break;

  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();

      // Add cost of workaround.
      if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Assuming no fp32 denormals lowering.
    if (SLT == MVT::f32 || SLT == MVT::f16) {
      assert(!ST->hasFP32Denormals() && "will change when supported");
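      // Roughly one quarter-rate reciprocal plus seven full-rate instructions
      // for the surrounding scale/fma/fixup sequence the fdiv lowers to when
      // fp32 denormals are disabled.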
      int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
      return LT.first * NElts * Cost;
    }

    break;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
                                       Opd1PropInfo, Opd2PropInfo);
}

unsigned AMDGPUTTIImpl::getCFInstrCost(unsigned Opcode) {
  // XXX - For some reason this isn't called for switch.
  switch (Opcode) {
  case Instruction::Br:
  case Instruction::Ret:
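    // Branches and returns are priced relatively high: on a SIMT target,
    // control flow generally implies exec-mask manipulation, which this
    // presumably reflects.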
    return 10;
  default:
    return BaseT::getCFInstrCost(Opcode);
  }
}

int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
                                      unsigned Index) {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement:
    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
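    // (~0u is the value callers pass when the lane index is not a
    // compile-time constant.)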
    return Index == ~0u ? 2 : 0;
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
  }
}

static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
                                          const IntrinsicInst *I) {
  switch (I->getIntrinsicID()) {
  default:
    return false;
  case Intrinsic::not_intrinsic:
    // This means we have an intrinsic that isn't defined in
    // IntrinsicsAMDGPU.td
    break;

  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
  case Intrinsic::amdgcn_interp_mov:
  case Intrinsic::amdgcn_interp_p1:
  case Intrinsic::amdgcn_interp_p2:
  case Intrinsic::amdgcn_mbcnt_hi:
  case Intrinsic::amdgcn_mbcnt_lo:
  case Intrinsic::r600_read_tidig_x:
  case Intrinsic::r600_read_tidig_y:
  case Intrinsic::r600_read_tidig_z:
  case Intrinsic::amdgcn_image_atomic_swap:
  case Intrinsic::amdgcn_image_atomic_add:
  case Intrinsic::amdgcn_image_atomic_sub:
  case Intrinsic::amdgcn_image_atomic_smin:
  case Intrinsic::amdgcn_image_atomic_umin:
  case Intrinsic::amdgcn_image_atomic_smax:
  case Intrinsic::amdgcn_image_atomic_umax:
  case Intrinsic::amdgcn_image_atomic_and:
  case Intrinsic::amdgcn_image_atomic_or:
  case Intrinsic::amdgcn_image_atomic_xor:
  case Intrinsic::amdgcn_image_atomic_inc:
  case Intrinsic::amdgcn_image_atomic_dec:
  case Intrinsic::amdgcn_image_atomic_cmpswap:
  case Intrinsic::amdgcn_buffer_atomic_swap:
  case Intrinsic::amdgcn_buffer_atomic_add:
  case Intrinsic::amdgcn_buffer_atomic_sub:
  case Intrinsic::amdgcn_buffer_atomic_smin:
  case Intrinsic::amdgcn_buffer_atomic_umin:
  case Intrinsic::amdgcn_buffer_atomic_smax:
  case Intrinsic::amdgcn_buffer_atomic_umax:
  case Intrinsic::amdgcn_buffer_atomic_and:
  case Intrinsic::amdgcn_buffer_atomic_or:
  case Intrinsic::amdgcn_buffer_atomic_xor:
  case Intrinsic::amdgcn_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_ps_live:
    return true;
  }

  StringRef Name = I->getCalledFunction()->getName();
  switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
  default:
    return false;
  case AMDGPUIntrinsic::SI_fs_interp:
  case AMDGPUIntrinsic::SI_fs_constant:
    return true;
  }
}

static bool isArgPassedInSGPR(const Argument *A) {
  const Function *F = A->getParent();

  // Arguments to compute shaders are never a source of divergence.
  if (!AMDGPU::isShader(F->getCallingConv()))
    return true;

  // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
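  // (Attribute indices are shifted by one because index 0 refers to the
  // return value.)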
  if (F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::InReg) ||
      F->getAttributes().hasAttribute(A->getArgNo() + 1, Attribute::ByVal))
    return true;

  // Everything else is in VGPRs.
  return false;
}

///
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {

  if (const Argument *A = dyn_cast<Argument>(V))
    return !isArgPassedInSGPR(A);

  // Loads from the private address space are divergent, because threads
  // can execute the load instruction with the same inputs and get different
  // results.
  //
  // All other loads are not divergent, because if threads issue loads with the
  // same arguments, they will always get the same result.
  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;

  // Atomics are divergent because they are executed sequentially: when an
  // atomic operation refers to the same address in each thread, then each
  // thread after the first sees the value written by the previous thread as
  // the original value.
  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
    return true;

  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
    const TargetMachine &TM = getTLI()->getTargetMachine();
    return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
  }

  // Assume all function calls are a source of divergence.
  if (isa<CallInst>(V) || isa<InvokeInst>(V))
    return true;

  return false;
}
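
For context, IR passes normally reach these hooks through the generic TargetTransformInfo wrapper (obtained in a legacy pass via getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F)) rather than by constructing AMDGPUTTIImpl directly. A minimal sketch of that usage; the helper name CountDivergentValues is purely illustrative and not part of this file:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/InstIterator.h"

// Hypothetical helper: count values the target reports as divergent.
static unsigned CountDivergentValues(Function &F,
                                     const TargetTransformInfo &TTI) {
  unsigned N = 0;
  for (Instruction &I : instructions(F))
    if (TTI.isSourceOfDivergence(&I)) // Dispatches to the hook defined above.
      ++N;
  return N;
}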