Line data Source code
1 : //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 : //
3 : // The LLVM Compiler Infrastructure
4 : //
5 : // This file is distributed under the University of Illinois Open Source
6 : // License. See LICENSE.TXT for details.
7 : //
8 : //===----------------------------------------------------------------------===//
9 : //
10 : // \file
11 : // This file implements a TargetTransformInfo analysis pass specific to the
12 : // AMDGPU target machine. It uses the target's detailed information to provide
13 : // more precise answers to certain TTI queries, while letting the target
14 : // independent and default TTI implementations handle the rest.
15 : //
16 : //===----------------------------------------------------------------------===//
17 :
18 : #include "AMDGPUTargetTransformInfo.h"
19 : #include "AMDGPUSubtarget.h"
20 : #include "Utils/AMDGPUBaseInfo.h"
21 : #include "llvm/ADT/STLExtras.h"
22 : #include "llvm/Analysis/LoopInfo.h"
23 : #include "llvm/Analysis/TargetTransformInfo.h"
24 : #include "llvm/Analysis/ValueTracking.h"
25 : #include "llvm/CodeGen/ISDOpcodes.h"
26 : #include "llvm/CodeGen/ValueTypes.h"
27 : #include "llvm/IR/Argument.h"
28 : #include "llvm/IR/Attributes.h"
29 : #include "llvm/IR/BasicBlock.h"
30 : #include "llvm/IR/CallingConv.h"
31 : #include "llvm/IR/DataLayout.h"
32 : #include "llvm/IR/DerivedTypes.h"
33 : #include "llvm/IR/Function.h"
34 : #include "llvm/IR/Instruction.h"
35 : #include "llvm/IR/Instructions.h"
36 : #include "llvm/IR/IntrinsicInst.h"
37 : #include "llvm/IR/Module.h"
38 : #include "llvm/IR/PatternMatch.h"
39 : #include "llvm/IR/Type.h"
40 : #include "llvm/IR/Value.h"
41 : #include "llvm/MC/SubtargetFeature.h"
42 : #include "llvm/Support/Casting.h"
43 : #include "llvm/Support/CommandLine.h"
44 : #include "llvm/Support/Debug.h"
45 : #include "llvm/Support/ErrorHandling.h"
46 : #include "llvm/Support/MachineValueType.h"
47 : #include "llvm/Support/raw_ostream.h"
48 : #include "llvm/Target/TargetMachine.h"
49 : #include <algorithm>
50 : #include <cassert>
51 : #include <limits>
52 : #include <utility>
53 :
54 : using namespace llvm;
55 :
56 : #define DEBUG_TYPE "AMDGPUtti"
57 :
58 : static cl::opt<unsigned> UnrollThresholdPrivate(
59 : "amdgpu-unroll-threshold-private",
60 : cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
61 : cl::init(2500), cl::Hidden);
62 :
63 : static cl::opt<unsigned> UnrollThresholdLocal(
64 : "amdgpu-unroll-threshold-local",
65 : cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
66 : cl::init(1000), cl::Hidden);
67 :
68 : static cl::opt<unsigned> UnrollThresholdIf(
69 : "amdgpu-unroll-threshold-if",
70 : cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
71 : cl::init(150), cl::Hidden);
72 :
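: // Returns true if \p Cond depends (through its operand chain, up to a small
: // recursion depth) on a PHI that is defined in \p L itself rather than in one
: // of its subloops.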
73 8 : static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
74 : unsigned Depth = 0) {
75 : const Instruction *I = dyn_cast<Instruction>(Cond);
76 : if (!I)
77 : return false;
78 :
79 16 : for (const Value *V : I->operand_values()) {
80 8 : if (!L->contains(I))
81 : continue;
82 : if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
83 6 : if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
84 : return SubLoop->contains(PHI); }))
85 : return true;
86 2 : } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
87 : return true;
88 : }
89 : return false;
90 : }
91 :
92 20 : void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
93 : TTI::UnrollingPreferences &UP) {
94 20 : UP.Threshold = 300; // Twice the default.
95 20 : UP.MaxCount = std::numeric_limits<unsigned>::max();
96 20 : UP.Partial = true;
97 :
98 : // TODO: Do we want runtime unrolling?
99 :
100 : // Maximum alloca size that can fit in registers. Reserve 16 registers.
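: // 256 VGPRs available to a wavefront, minus the 16 reserved ones, at 4 bytes
: // per 32-bit register.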
101 : const unsigned MaxAlloca = (256 - 16) * 4;
102 20 : unsigned ThresholdPrivate = UnrollThresholdPrivate;
103 20 : unsigned ThresholdLocal = UnrollThresholdLocal;
104 20 : unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
105 54 : for (const BasicBlock *BB : L->getBlocks()) {
106 37 : const DataLayout &DL = BB->getModule()->getDataLayout();
107 : unsigned LocalGEPsSeen = 0;
108 :
109 37 : if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
110 : return SubLoop->contains(BB); }))
111 : continue; // Block belongs to an inner loop.
112 :
113 223 : for (const Instruction &I : *BB) {
114 : // Unroll a loop which contains an "if" statement whose condition is
115 : // defined by a PHI belonging to the loop. This may help to eliminate the
116 : // if region and potentially even the PHI itself, saving on both divergence
117 : // and registers used for the PHI.
118 : // Add a small bonus for each such "if" statement.
119 : if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
120 34 : if (UP.Threshold < MaxBoost && Br->isConditional()) {
121 35 : if (L->isLoopExiting(Br->getSuccessor(0)) ||
122 14 : L->isLoopExiting(Br->getSuccessor(1)))
123 15 : continue;
124 6 : if (dependsOnLocalPhi(L, Br->getCondition())) {
125 6 : UP.Threshold += UnrollThresholdIf;
126 : LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
127 : << " for loop:\n"
128 : << *L << " due to " << *Br << '\n');
129 6 : if (UP.Threshold >= MaxBoost)
130 : return;
131 : }
132 : }
133 19 : continue;
134 : }
135 :
136 : const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
137 : if (!GEP)
138 : continue;
139 :
140 : unsigned AS = GEP->getAddressSpace();
141 : unsigned Threshold = 0;
142 30 : if (AS == AMDGPUAS::PRIVATE_ADDRESS)
143 5 : Threshold = ThresholdPrivate;
144 25 : else if (AS == AMDGPUAS::LOCAL_ADDRESS)
145 2 : Threshold = ThresholdLocal;
146 : else
147 : continue;
148 :
149 7 : if (UP.Threshold >= Threshold)
150 : continue;
151 :
152 7 : if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
153 : const Value *Ptr = GEP->getPointerOperand();
154 : const AllocaInst *Alloca =
155 : dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
156 5 : if (!Alloca || !Alloca->isStaticAlloca())
157 1 : continue;
158 4 : Type *Ty = Alloca->getAllocatedType();
159 4 : unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
160 4 : if (AllocaSize > MaxAlloca)
161 : continue;
162 2 : } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
163 2 : LocalGEPsSeen++;
164 : // Inhibit unrolling for local memory if we have seen addressing that is not
165 : // based directly on a variable; we will most likely be unable to combine it.
166 : // Do not unroll too-deep inner loops for local memory, to give an outer
167 : // loop a chance to be unrolled for a more important reason.
168 4 : if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
169 2 : (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
170 : !isa<Argument>(GEP->getPointerOperand())))
171 : continue;
172 : }
173 :
174 : // Check if GEP depends on a value defined by this loop itself.
175 : bool HasLoopDef = false;
176 18 : for (const Value *Op : GEP->operands()) {
177 : const Instruction *Inst = dyn_cast<Instruction>(Op);
178 8 : if (!Inst || L->isLoopInvariant(Op))
179 8 : continue;
180 :
181 5 : if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
182 : return SubLoop->contains(Inst); }))
183 : continue;
184 : HasLoopDef = true;
185 : break;
186 : }
187 5 : if (!HasLoopDef)
188 : continue;
189 :
190 : // We want to do whatever we can to limit the number of alloca
191 : // instructions that make it through to the code generator. Allocas
192 : // require us to use indirect addressing, which is slow and prone to
193 : // compiler bugs. If this loop does an address calculation on an
194 : // alloca ptr, then we want to use a higher than normal loop unroll
195 : // threshold. This will give SROA a better chance to eliminate these
196 : // allocas.
197 : //
198 : // We also want to have more unrolling for local memory to let ds
199 : // instructions with different offsets combine.
200 : //
201 : // Don't use the maximum allowed value here as it will make some
202 : // programs way too big.
203 5 : UP.Threshold = Threshold;
204 : LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
205 : << " for loop:\n"
206 : << *L << " due to " << *GEP << '\n');
207 5 : if (UP.Threshold >= MaxBoost)
208 : return;
209 : }
210 : }
211 : }
212 :
213 2540 : unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
214 : // The concept of vector registers doesn't really exist. Some packed vector
215 : // operations operate on the normal 32-bit registers.
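: // A single wavefront can address at most 256 32-bit VGPRs.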
216 2540 : return 256;
217 : }
218 :
219 2540 : unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
220 : // This is really the number of registers to fill when vectorizing /
221 : // interleaving loops, so we lie to avoid trying to use all registers.
222 2540 : return getHardwareNumberOfRegisters(Vec) >> 3;
223 : }
224 :
225 104 : unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
226 104 : return 32;
227 : }
228 :
229 90 : unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
230 90 : return 32;
231 : }
232 :
233 13598 : unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
234 : unsigned ChainSizeInBytes,
235 : VectorType *VecTy) const {
236 13598 : unsigned VecRegBitWidth = VF * LoadSize;
237 13598 : if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
238 : // TODO: Support element sizes of less than 32 bits?
239 2639 : return 128 / LoadSize;
240 :
241 : return VF;
242 : }
243 :
244 3662 : unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
245 : unsigned ChainSizeInBytes,
246 : VectorType *VecTy) const {
247 3662 : unsigned VecRegBitWidth = VF * StoreSize;
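: // Cap vectorized stores at 128 bits; a single store instruction writes at
: // most a dwordx4, so wider chains would be split anyway.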
248 3662 : if (VecRegBitWidth > 128)
249 3302 : return 128 / StoreSize;
250 :
251 : return VF;
252 : }
253 :
254 64732 : unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
255 129464 : if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
256 64732 : AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
257 : AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
258 : return 512;
259 : }
260 :
261 12814 : if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
262 6407 : AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
263 : AddrSpace == AMDGPUAS::REGION_ADDRESS)
264 : return 128;
265 :
266 957 : if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
267 957 : return 8 * ST->getMaxPrivateElementSize();
268 :
269 0 : llvm_unreachable("unhandled address space");
270 : }
271 :
272 8914 : bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
273 : unsigned Alignment,
274 : unsigned AddrSpace) const {
275 : // We allow vectorization of flat stores, even though we may need to decompose
276 : // them later if they may access private memory. We don't have enough context
277 : // here, and legalization can handle it.
278 8914 : if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
279 106 : return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
280 66 : ChainSizeInBytes <= ST->getMaxPrivateElementSize();
281 : }
282 : return true;
283 : }
284 :
285 8590 : bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
286 : unsigned Alignment,
287 : unsigned AddrSpace) const {
288 8590 : return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
289 : }
290 :
291 324 : bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
292 : unsigned Alignment,
293 : unsigned AddrSpace) const {
294 324 : return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
295 : }
296 :
297 8 : unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
298 : // Disable unrolling if the loop is not vectorized.
299 : // TODO: Enable this again.
300 8 : if (VF == 1)
301 6 : return 1;
302 :
303 : return 8;
304 : }
305 :
306 9580 : bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
307 : MemIntrinsicInfo &Info) const {
308 : switch (Inst->getIntrinsicID()) {
309 749 : case Intrinsic::amdgcn_atomic_inc:
310 : case Intrinsic::amdgcn_atomic_dec:
311 : case Intrinsic::amdgcn_ds_fadd:
312 : case Intrinsic::amdgcn_ds_fmin:
313 : case Intrinsic::amdgcn_ds_fmax: {
314 749 : auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
315 : auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
316 749 : if (!Ordering || !Volatile)
317 : return false; // Invalid.
318 :
319 731 : unsigned OrderingVal = Ordering->getZExtValue();
320 731 : if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
321 : return false;
322 :
323 731 : Info.PtrVal = Inst->getArgOperand(0);
324 731 : Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
325 731 : Info.ReadMem = true;
326 731 : Info.WriteMem = true;
327 731 : Info.IsVolatile = !Volatile->isNullValue();
328 731 : return true;
329 : }
330 : default:
331 : return false;
332 : }
333 : }
334 :
335 313 : int GCNTTIImpl::getArithmeticInstrCost(
336 : unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
337 : TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
338 : TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args ) {
339 313 : EVT OrigTy = TLI->getValueType(DL, Ty);
340 313 : if (!OrigTy.isSimple()) {
341 102 : return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
342 34 : Opd1PropInfo, Opd2PropInfo);
343 : }
344 :
345 : // Legalize the type.
346 279 : std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
347 279 : int ISD = TLI->InstructionOpcodeToISD(Opcode);
348 :
349 : // Because we don't have any legal vector operations, only legal vector
350 : // types, we need to account for split vectors.
351 558 : unsigned NElts = LT.second.isVector() ?
352 : LT.second.getVectorNumElements() : 1;
353 :
354 : MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
355 :
356 279 : switch (ISD) {
357 12 : case ISD::SHL:
358 : case ISD::SRL:
359 : case ISD::SRA:
360 12 : if (SLT == MVT::i64)
361 9 : return get64BitInstrCost() * LT.first * NElts;
362 :
363 : // i32
364 6 : return getFullRateInstrCost() * LT.first * NElts;
365 67 : case ISD::ADD:
366 : case ISD::SUB:
367 : case ISD::AND:
368 : case ISD::OR:
369 : case ISD::XOR:
370 67 : if (SLT == MVT::i64){
371 : // and, or and xor are typically split into 2 VALU instructions.
372 26 : return 2 * getFullRateInstrCost() * LT.first * NElts;
373 : }
374 :
375 41 : return LT.first * NElts * getFullRateInstrCost();
376 : case ISD::MUL: {
377 : const int QuarterRateCost = getQuarterRateInstrCost();
378 9 : if (SLT == MVT::i64) {
379 : const int FullRateCost = getFullRateInstrCost();
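: // Rough estimate for the expansion of a 64-bit multiply: about four 32-bit
: // multiplies (quarter rate) plus the full rate adds combining the partial
: // products.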
380 5 : return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
381 : }
382 :
383 : // i32
384 4 : return QuarterRateCost * NElts * LT.first;
385 : }
386 107 : case ISD::FADD:
387 : case ISD::FSUB:
388 : case ISD::FMUL:
389 107 : if (SLT == MVT::f64)
390 27 : return LT.first * NElts * get64BitInstrCost();
391 :
392 89 : if (SLT == MVT::f32 || SLT == MVT::f16)
393 89 : return LT.first * NElts * getFullRateInstrCost();
394 : break;
395 84 : case ISD::FDIV:
396 : case ISD::FREM:
397 : // FIXME: frem should be handled separately. The fdiv in it is most of it,
398 : // but the current lowering is also not entirely correct.
399 84 : if (SLT == MVT::f64) {
400 24 : int Cost = 4 * get64BitInstrCost() + 7 * getQuarterRateInstrCost();
401 : // Add cost of workaround.
402 24 : if (ST->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
403 8 : Cost += 3 * getFullRateInstrCost();
404 :
405 24 : return LT.first * Cost * NElts;
406 60 : }
407 :
408 60 : if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
409 : // TODO: This is more complicated, unsafe flags etc.
410 24 : if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
411 2 : (SLT == MVT::f16 && ST->has16BitInsts())) {
412 18 : return LT.first * getQuarterRateInstrCost() * NElts;
413 : }
414 : }
415 :
416 42 : if (SLT == MVT::f16 && ST->has16BitInsts()) {
417 : // 2 x v_cvt_f32_f16
418 : // f32 rcp
419 : // f32 fmul
420 : // v_cvt_f16_f32
421 : // f16 div_fixup
422 : int Cost = 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost();
423 3 : return LT.first * Cost * NElts;
424 : }
425 :
426 39 : if (SLT == MVT::f32 || SLT == MVT::f16) {
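: // Roughly models the full division expansion (div_scale, rcp, an FMA
: // refinement chain, div_fmas and div_fixup); the rcp is the quarter rate
: // instruction.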
427 : int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
428 :
429 39 : if (!ST->hasFP32Denormals()) {
430 : // FP mode switches.
431 : Cost += 2 * getFullRateInstrCost();
432 : }
433 :
434 39 : return LT.first * NElts * Cost;
435 0 : }
436 : break;
437 : default:
438 : break;
439 : }
440 :
441 0 : return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
442 0 : Opd1PropInfo, Opd2PropInfo);
443 : }
444 :
445 314 : unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
446 : // XXX - For some reason this isn't called for switch.
447 314 : switch (Opcode) {
448 : case Instruction::Br:
449 : case Instruction::Ret:
450 : return 10;
451 17 : default:
452 17 : return BaseT::getCFInstrCost(Opcode);
453 : }
454 : }
455 :
456 24 : int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
457 : bool IsPairwise) {
458 24 : EVT OrigTy = TLI->getValueType(DL, Ty);
459 :
460 : // Compute the cost on targets that have packed math instructions (which
461 : // support 16-bit types only).
462 12 : if (IsPairwise ||
463 30 : !ST->hasVOP3PInsts() ||
464 : OrigTy.getScalarSizeInBits() != 16)
465 19 : return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
466 :
467 5 : std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
468 5 : return LT.first * getFullRateInstrCost();
469 : }
470 :
471 38 : int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
472 : bool IsPairwise,
473 : bool IsUnsigned) {
474 38 : EVT OrigTy = TLI->getValueType(DL, Ty);
475 :
476 : // Compute the cost on targets that have packed math instructions (which
477 : // support 16-bit types only).
478 19 : if (IsPairwise ||
479 50 : !ST->hasVOP3PInsts() ||
480 : OrigTy.getScalarSizeInBits() != 16)
481 31 : return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
482 :
483 7 : std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
484 7 : return LT.first * getHalfRateInstrCost();
485 : }
486 :
487 1381 : int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
488 : unsigned Index) {
489 1381 : switch (Opcode) {
490 1381 : case Instruction::ExtractElement:
491 : case Instruction::InsertElement: {
492 : unsigned EltSize
493 1381 : = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
494 1381 : if (EltSize < 32) {
495 776 : if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
496 : return 0;
497 612 : return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
498 : }
499 :
500 : // Extracts are just reads of a subregister, so are free. Inserts are
501 : // considered free because we don't want to have any cost for scalarizing
502 : // operations, and we don't have to copy into a different register class.
503 :
504 : // Dynamic indexing isn't free and is best avoided.
505 605 : return Index == ~0u ? 2 : 0;
506 : }
507 0 : default:
508 0 : return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
509 : }
510 : }
511 :
512 :
513 :
514 225692 : static bool isArgPassedInSGPR(const Argument *A) {
515 225692 : const Function *F = A->getParent();
516 :
517 : // Arguments to compute shaders are never a source of divergence.
518 : CallingConv::ID CC = F->getCallingConv();
519 : switch (CC) {
520 : case CallingConv::AMDGPU_KERNEL:
521 : case CallingConv::SPIR_KERNEL:
522 : return true;
523 37203 : case CallingConv::AMDGPU_VS:
524 : case CallingConv::AMDGPU_LS:
525 : case CallingConv::AMDGPU_HS:
526 : case CallingConv::AMDGPU_ES:
527 : case CallingConv::AMDGPU_GS:
528 : case CallingConv::AMDGPU_PS:
529 : case CallingConv::AMDGPU_CS:
530 : // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
531 : // Everything else is in VGPRs.
532 62332 : return F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::InReg) ||
533 62332 : F->getAttributes().hasParamAttribute(A->getArgNo(), Attribute::ByVal);
534 : default:
535 : // TODO: Should calls support inreg for SGPR inputs?
536 : return false;
537 : }
538 : }
539 :
540 : /// \returns true if the result of the value could potentially be
541 : /// different across workitems in a wavefront.
542 1444647 : bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
543 : if (const Argument *A = dyn_cast<Argument>(V))
544 225692 : return !isArgPassedInSGPR(A);
545 :
546 : // Loads from the private and flat address spaces are divergent, because
547 : // threads can execute the load instruction with the same inputs and get
548 : // different results.
549 : //
550 : // All other loads are not divergent, because if threads issue loads with the
551 : // same arguments, they will always get the same result.
552 : if (const LoadInst *Load = dyn_cast<LoadInst>(V))
553 151008 : return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
554 : Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
555 :
556 : // Atomics are divergent because they are executed sequentially: when an
557 : // atomic operation refers to the same address in each thread, then each
558 : // thread after the first sees the value written by the previous thread as
559 : // the original value.
560 : if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
561 : return true;
562 :
563 : if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
564 132435 : return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
565 :
566 : // Assume all function calls are a source of divergence.
567 : if (isa<CallInst>(V) || isa<InvokeInst>(V))
568 9861 : return true;
569 :
570 : return false;
571 : }
572 :
573 254054 : bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
574 : if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
575 46706 : switch (Intrinsic->getIntrinsicID()) {
576 : default:
577 : return false;
578 492 : case Intrinsic::amdgcn_readfirstlane:
579 : case Intrinsic::amdgcn_readlane:
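: // These read the value of a single lane into a scalar register, so the
: // result is uniform across the wavefront.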
580 492 : return true;
581 : }
582 : }
583 : return false;
584 : }
585 :
586 85 : unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
587 : Type *SubTp) {
588 85 : if (ST->hasVOP3PInsts()) {
589 : VectorType *VT = cast<VectorType>(Tp);
590 49 : if (VT->getNumElements() == 2 &&
591 9 : DL.getTypeSizeInBits(VT->getElementType()) == 16) {
592 : // With op_sel, VOP3P instructions can freely access either the low or the
593 : // high half of a register, so any swizzle is free.
594 :
595 : switch (Kind) {
596 : case TTI::SK_Broadcast:
597 : case TTI::SK_Reverse:
598 : case TTI::SK_PermuteSingleSrc:
599 : return 0;
600 : default:
601 : break;
602 : }
603 : }
604 : }
605 :
606 81 : return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
607 : }
608 :
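: // Inlining is compatible only if every subtarget feature required by the
: // callee (outside of the ignore list) is also enabled in the caller.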
609 127 : bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
610 : const Function *Callee) const {
611 127 : const TargetMachine &TM = getTLI()->getTargetMachine();
612 : const FeatureBitset &CallerBits =
613 127 : TM.getSubtargetImpl(*Caller)->getFeatureBits();
614 : const FeatureBitset &CalleeBits =
615 127 : TM.getSubtargetImpl(*Callee)->getFeatureBits();
616 :
617 127 : FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
618 127 : FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
619 127 : return ((RealCallerBits & RealCalleeBits) == RealCalleeBits);
620 : }
621 :
622 17 : void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
623 : TTI::UnrollingPreferences &UP) {
624 17 : CommonTTI.getUnrollingPreferences(L, SE, UP);
625 17 : }
626 :
627 238 : unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
628 238 : return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
629 : }
630 :
631 238 : unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
632 238 : return getHardwareNumberOfRegisters(Vec);
633 : }
634 :
635 11 : unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
636 11 : return 32;
637 : }
638 :
639 11 : unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
640 11 : return 32;
641 : }
642 :
643 3914 : unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
644 7828 : if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
645 3914 : AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
646 : return 128;
647 939 : if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
648 : AddrSpace == AMDGPUAS::REGION_ADDRESS)
649 : return 64;
650 429 : if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
651 : return 32;
652 :
653 285 : if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
654 285 : AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
655 : (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
656 : AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
657 : return 128;
658 0 : llvm_unreachable("unhandled address space");
659 : }
660 :
661 97 : bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
662 : unsigned Alignment,
663 : unsigned AddrSpace) const {
664 : // We allow vectorization of flat stores, even though we may need to decompose
665 : // them later if they may access private memory. We don't have enough context
666 : // here, and legalization can handle it.
667 97 : return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
668 : }
669 :
670 67 : bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
671 : unsigned Alignment,
672 : unsigned AddrSpace) const {
673 67 : return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
674 : }
675 :
676 30 : bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
677 : unsigned Alignment,
678 : unsigned AddrSpace) const {
679 30 : return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
680 : }
681 :
682 0 : unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
683 : // Disable unrolling if the loop is not vectorized.
684 : // TODO: Enable this again.
685 0 : if (VF == 1)
686 0 : return 1;
687 :
688 : return 8;
689 : }
690 :
691 0 : unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
692 : // XXX - For some reason this isn't called for switch.
693 0 : switch (Opcode) {
694 : case Instruction::Br:
695 : case Instruction::Ret:
696 : return 10;
697 0 : default:
698 0 : return BaseT::getCFInstrCost(Opcode);
699 : }
700 : }
701 :
702 0 : int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
703 : unsigned Index) {
704 0 : switch (Opcode) {
705 0 : case Instruction::ExtractElement:
706 : case Instruction::InsertElement: {
707 : unsigned EltSize
708 0 : = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
709 0 : if (EltSize < 32) {
710 0 : return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
711 : }
712 :
713 : // Extracts are just reads of a subregister, so are free. Inserts are
714 : // considered free because we don't want to have any cost for scalarizing
715 : // operations, and we don't have to copy into a different register class.
716 :
717 : // Dynamic indexing isn't free and is best avoided.
718 0 : return Index == ~0u ? 2 : 0;
719 : }
720 0 : default:
721 0 : return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
722 : }
723 : }
724 :
725 3 : void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
726 : TTI::UnrollingPreferences &UP) {
727 3 : CommonTTI.getUnrollingPreferences(L, SE, UP);
728 3 : }