LLVM 13.0.0git
AMDGPUTargetTransformInfo.cpp
1 //===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // \file
10 // This file implements a TargetTransformInfo analysis pass specific to the
11 // AMDGPU target machine. It uses the target's detailed information to provide
12 // more precise answers to certain TTI queries, while letting the target
13 // independent and default TTI implementations handle the rest.
14 //
15 //===----------------------------------------------------------------------===//
16 
17 #include "AMDGPUTargetTransformInfo.h"
18 #include "AMDGPUTargetMachine.h"
19 #include "llvm/Analysis/LoopInfo.h"
20 #include "llvm/Analysis/ValueTracking.h"
21 #include "llvm/IR/IntrinsicsAMDGPU.h"
22 #include "llvm/IR/PatternMatch.h"
23 #include "llvm/Support/KnownBits.h"
24 
25 using namespace llvm;
26 
27 #define DEBUG_TYPE "AMDGPUtti"
28 
29 static cl::opt<unsigned> UnrollThresholdPrivate(
30  "amdgpu-unroll-threshold-private",
31  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
32  cl::init(2700), cl::Hidden);
33 
34 static cl::opt<unsigned> UnrollThresholdLocal(
35  "amdgpu-unroll-threshold-local",
36  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
37  cl::init(1000), cl::Hidden);
38 
39 static cl::opt<unsigned> UnrollThresholdIf(
40  "amdgpu-unroll-threshold-if",
41  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
42  cl::init(150), cl::Hidden);
43 
44 static cl::opt<bool> UnrollRuntimeLocal(
45  "amdgpu-unroll-runtime-local",
46  cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
47  cl::init(true), cl::Hidden);
48 
49 static cl::opt<bool> UseLegacyDA(
50  "amdgpu-use-legacy-divergence-analysis",
51  cl::desc("Enable legacy divergence analysis for AMDGPU"),
52  cl::init(false), cl::Hidden);
53 
54 static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
55  "amdgpu-unroll-max-block-to-analyze",
56  cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
57  cl::init(32), cl::Hidden);
58 
59 static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
60  cl::Hidden, cl::init(4000),
61  cl::desc("Cost of alloca argument"));
62 
63 // If the amount of scratch memory to eliminate exceeds our ability to allocate
64 // it into registers we gain nothing by aggressively inlining functions for that
65 // heuristic.
66 static cl::opt<unsigned>
67  ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
68  cl::init(256),
69  cl::desc("Maximum alloca size to use for inline cost"));
70 
71 // Inliner constraint to achieve reasonable compilation time.
72 static cl::opt<size_t> InlineMaxBB(
73  "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
74  cl::desc("Maximum number of BBs allowed in a function after inlining"
75  " (compile time constraint)"));
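// As with any cl::opt, the hidden flags above can be overridden on an llc/opt
// command line, or forwarded from clang with -mllvm. A minimal, illustrative
// invocation (example values only, not recommendations):
//   llc -mtriple=amdgcn -amdgpu-unroll-threshold-private=1000 input.ll
//   clang --target=amdgcn-amd-amdhsa -mllvm -amdgpu-inline-max-bb=500 ...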
76 
77 static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
78  unsigned Depth = 0) {
79  const Instruction *I = dyn_cast<Instruction>(Cond);
80  if (!I)
81  return false;
82 
83  for (const Value *V : I->operand_values()) {
84  if (!L->contains(I))
85  continue;
86  if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
87  if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
88  return SubLoop->contains(PHI); }))
89  return true;
90  } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
91  return true;
92  }
93  return false;
94 }
95 
96 AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
97  : BaseT(TM, F.getParent()->getDataLayout()),
98  TargetTriple(TM->getTargetTriple()),
99  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
100  TLI(ST->getTargetLowering()) {}
101 
102 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
103  TTI::UnrollingPreferences &UP) {
104  const Function &F = *L->getHeader()->getParent();
105  UP.Threshold = AMDGPU::getIntegerAttribute(F, "amdgpu-unroll-threshold", 300);
106  UP.MaxCount = std::numeric_limits<unsigned>::max();
107  UP.Partial = true;
108 
109  // TODO: Do we want runtime unrolling?
110 
111  // Maximum alloca size that can fit in registers. Reserve 16 registers.
112  const unsigned MaxAlloca = (256 - 16) * 4;
113  unsigned ThresholdPrivate = UnrollThresholdPrivate;
114  unsigned ThresholdLocal = UnrollThresholdLocal;
115 
116  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
117  // provided threshold value as the default for Threshold
118  if (MDNode *LoopUnrollThreshold =
119  findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
120  if (LoopUnrollThreshold->getNumOperands() == 2) {
121  ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
122  LoopUnrollThreshold->getOperand(1));
123  if (MetaThresholdValue) {
124  // We will also use the supplied value for PartialThreshold for now.
125  // We may introduce additional metadata if it becomes necessary in the
126  // future.
127  UP.Threshold = MetaThresholdValue->getSExtValue();
128  UP.PartialThreshold = UP.Threshold;
129  ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
130  ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
131  }
132  }
133  }
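  // For reference, the metadata read above lives in the loop's !llvm.loop
  // annotation; a minimal illustrative IR form (not taken from this file) is:
  //   br i1 %exitcond, label %exit, label %body, !llvm.loop !0
  //   !0 = distinct !{!0, !1}
  //   !1 = !{!"amdgpu.loop.unroll.threshold", i32 100}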
134 
135  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
136  for (const BasicBlock *BB : L->getBlocks()) {
137  const DataLayout &DL = BB->getModule()->getDataLayout();
138  unsigned LocalGEPsSeen = 0;
139 
140  if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
141  return SubLoop->contains(BB); }))
142  continue; // Block belongs to an inner loop.
143 
144  for (const Instruction &I : *BB) {
145  // Unroll a loop which contains an "if" statement whose condition is
146  // defined by a PHI belonging to the loop. This may help to eliminate the
147  // if region and potentially even the PHI itself, saving on both divergence
148  // and registers used for the PHI.
149  // Add a small bonus for each such "if" statement.
150  if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
151  if (UP.Threshold < MaxBoost && Br->isConditional()) {
152  BasicBlock *Succ0 = Br->getSuccessor(0);
153  BasicBlock *Succ1 = Br->getSuccessor(1);
154  if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
155  (L->contains(Succ1) && L->isLoopExiting(Succ1)))
156  continue;
157  if (dependsOnLocalPhi(L, Br->getCondition())) {
158  UP.Threshold += UnrollThresholdIf;
159  LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
160  << " for loop:\n"
161  << *L << " due to " << *Br << '\n');
162  if (UP.Threshold >= MaxBoost)
163  return;
164  }
165  }
166  continue;
167  }
168 
169  const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
170  if (!GEP)
171  continue;
172 
173  unsigned AS = GEP->getAddressSpace();
174  unsigned Threshold = 0;
175  if (AS == AMDGPUAS::PRIVATE_ADDRESS)
176  Threshold = ThresholdPrivate;
177  else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
178  Threshold = ThresholdLocal;
179  else
180  continue;
181 
182  if (UP.Threshold >= Threshold)
183  continue;
184 
185  if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
186  const Value *Ptr = GEP->getPointerOperand();
187  const AllocaInst *Alloca =
188  dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
189  if (!Alloca || !Alloca->isStaticAlloca())
190  continue;
191  Type *Ty = Alloca->getAllocatedType();
192  unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
193  if (AllocaSize > MaxAlloca)
194  continue;
195  } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
196  AS == AMDGPUAS::REGION_ADDRESS) {
197  LocalGEPsSeen++;
198  // Inhibit unrolling for local memory if we have seen addressing that is
199  // not based on a variable, since most likely we will be unable to combine it.
200  // Do not unroll too-deep inner loops for local memory, to give a chance
201  // to unroll an outer loop for a more important reason.
202  if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
203  (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
204  !isa<Argument>(GEP->getPointerOperand())))
205  continue;
206  LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
207  << *L << " due to LDS use.\n");
208  UP.Runtime = UnrollRuntimeLocal;
209  }
210 
211  // Check if GEP depends on a value defined by this loop itself.
212  bool HasLoopDef = false;
213  for (const Value *Op : GEP->operands()) {
214  const Instruction *Inst = dyn_cast<Instruction>(Op);
215  if (!Inst || L->isLoopInvariant(Op))
216  continue;
217 
218  if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
219  return SubLoop->contains(Inst); }))
220  continue;
221  HasLoopDef = true;
222  break;
223  }
224  if (!HasLoopDef)
225  continue;
226 
227  // We want to do whatever we can to limit the number of alloca
228  // instructions that make it through to the code generator. allocas
229  // require us to use indirect addressing, which is slow and prone to
230  // compiler bugs. If this loop does an address calculation on an
231  // alloca ptr, then we want to use a higher than normal loop unroll
232  // threshold. This will give SROA a better chance to eliminate these
233  // allocas.
234  //
235  // We also want to have more unrolling for local memory to let ds
236  // instructions with different offsets combine.
237  //
238  // Don't use the maximum allowed value here as it will make some
239  // programs way too big.
240  UP.Threshold = Threshold;
241  LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
242  << " for loop:\n"
243  << *L << " due to " << *GEP << '\n');
244  if (UP.Threshold >= MaxBoost)
245  return;
246  }
247 
248  // If we got a GEP in a small BB from inner loop then increase max trip
249  // count to analyze for better estimation cost in unroll
250  if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
251  UP.MaxIterationsCountToAnalyze = 32;
252  }
253 }
254 
255 void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
256  TTI::PeelingPreferences &PP) {
257  BaseT::getPeelingPreferences(L, SE, PP);
258 }
259 
260 const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
261  // Codegen control options which don't matter.
262  AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
263  AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
264  AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
265  AMDGPU::FeatureUnalignedAccessMode,
266 
267  AMDGPU::FeatureAutoWaitcntBeforeBarrier,
268 
269  // Property of the kernel/environment which can't actually differ.
270  AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
271  AMDGPU::FeatureTrapHandler,
272 
273  // The default assumption needs to be ecc is enabled, but no directly
274  // exposed operations depend on it, so it can be safely inlined.
275  AMDGPU::FeatureSRAMECC,
276 
277  // Perf-tuning features
278  AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
279 
280 GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
281  : BaseT(TM, F.getParent()->getDataLayout()),
282  ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
283  TLI(ST->getTargetLowering()), CommonTTI(TM, F),
284  IsGraphics(AMDGPU::isGraphics(F.getCallingConv())),
285  MaxVGPRs(ST->getMaxNumVGPRs(
286  std::max(ST->getWavesPerEU(F).first,
287  ST->getWavesPerEUForWorkGroup(
288  ST->getFlatWorkGroupSizes(F).second)))) {
289  AMDGPU::SIModeRegisterDefaults Mode(F);
290  HasFP32Denormals = Mode.allFP32Denormals();
291  HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
292 }
293 
294 unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
295  // The concept of vector registers doesn't really exist. Some packed vector
296  // operations operate on the normal 32-bit registers.
297  return MaxVGPRs;
298 }
299 
300 unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
301  // This is really the number of registers to fill when vectorizing /
302  // interleaving loops, so we lie to avoid trying to use all registers.
303  return getHardwareNumberOfRegisters(Vec) >> 3;
304 }
305 
306 unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
307  const SIRegisterInfo *TRI = ST->getRegisterInfo();
308  const TargetRegisterClass *RC = TRI->getRegClass(RCID);
309  unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
310  return getHardwareNumberOfRegisters(false) / NumVGPRs;
311 }
312 
313 unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
314  return (Vector && ST->hasPackedFP32Ops()) ? 64 : 32;
315 }
316 
317 unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
318  return 32;
319 }
320 
321 unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
322  if (Opcode == Instruction::Load || Opcode == Instruction::Store)
323  return 32 * 4 / ElemWidth;
324  return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
325  : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
326  : 1;
327 }
328 
329 unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
330  unsigned ChainSizeInBytes,
331  VectorType *VecTy) const {
332  unsigned VecRegBitWidth = VF * LoadSize;
333  if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
334  // TODO: Support element-size less than 32bit?
335  return 128 / LoadSize;
336 
337  return VF;
338 }
339 
340 unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
341  unsigned ChainSizeInBytes,
342  VectorType *VecTy) const {
343  unsigned VecRegBitWidth = VF * StoreSize;
344  if (VecRegBitWidth > 128)
345  return 128 / StoreSize;
346 
347  return VF;
348 }
349 
350 unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
351  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
352  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
353  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
354  AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
355  return 512;
356  }
357 
358  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
359  return 8 * ST->getMaxPrivateElementSize();
360 
361  // Common to flat, global, local and region. Assume for unknown addrspace.
362  return 128;
363 }
364 
365 bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
366  Align Alignment,
367  unsigned AddrSpace) const {
368  // We allow vectorization of flat stores, even though we may need to decompose
369  // them later if they may access private memory. We don't have enough context
370  // here, and legalization can handle it.
371  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
372  return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
373  ChainSizeInBytes <= ST->getMaxPrivateElementSize();
374  }
375  return true;
376 }
377 
378 bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
379  Align Alignment,
380  unsigned AddrSpace) const {
381  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
382 }
383 
384 bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
385  Align Alignment,
386  unsigned AddrSpace) const {
387  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
388 }
389 
390 // FIXME: Really we would like to issue multiple 128-bit loads and stores per
391 // iteration. Should we report a larger size and let it legalize?
392 //
393 // FIXME: Should we use narrower types for local/region, or account for when
394 // unaligned access is legal?
395 //
396 // FIXME: This could use fine tuning and microbenchmarks.
397 Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
398  unsigned SrcAddrSpace,
399  unsigned DestAddrSpace,
400  unsigned SrcAlign,
401  unsigned DestAlign) const {
402  unsigned MinAlign = std::min(SrcAlign, DestAlign);
403 
404  // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
405  // hardware into byte accesses. If you assume all alignments are equally
406  // probable, it's more efficient on average to use short accesses for this
407  // case.
408  if (MinAlign == 2)
409  return Type::getInt16Ty(Context);
410 
411  // Not all subtargets have 128-bit DS instructions, and we currently don't
412  // form them by default.
413  if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
414  SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
415  DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
416  DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
417  return FixedVectorType::get(Type::getInt32Ty(Context), 2);
418  }
419 
420  // Global memory works best with 16-byte accesses. Private memory will also
421  // hit this, although they'll be decomposed.
422  return FixedVectorType::get(Type::getInt32Ty(Context), 4);
423 }
424 
425 void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
426  SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
427  unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
428  unsigned SrcAlign, unsigned DestAlign) const {
429  assert(RemainingBytes < 16);
430 
431  unsigned MinAlign = std::min(SrcAlign, DestAlign);
432 
433  if (MinAlign != 2) {
434  Type *I64Ty = Type::getInt64Ty(Context);
435  while (RemainingBytes >= 8) {
436  OpsOut.push_back(I64Ty);
437  RemainingBytes -= 8;
438  }
439 
440  Type *I32Ty = Type::getInt32Ty(Context);
441  while (RemainingBytes >= 4) {
442  OpsOut.push_back(I32Ty);
443  RemainingBytes -= 4;
444  }
445  }
446 
447  Type *I16Ty = Type::getInt16Ty(Context);
448  while (RemainingBytes >= 2) {
449  OpsOut.push_back(I16Ty);
450  RemainingBytes -= 2;
451  }
452 
453  Type *I8Ty = Type::getInt8Ty(Context);
454  while (RemainingBytes) {
455  OpsOut.push_back(I8Ty);
456  --RemainingBytes;
457  }
458 }
459 
460 unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
461  // Disable unrolling if the loop is not vectorized.
462  // TODO: Enable this again.
463  if (VF == 1)
464  return 1;
465 
466  return 8;
467 }
468 
469 bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
470  MemIntrinsicInfo &Info) const {
471  switch (Inst->getIntrinsicID()) {
472  case Intrinsic::amdgcn_atomic_inc:
473  case Intrinsic::amdgcn_atomic_dec:
474  case Intrinsic::amdgcn_ds_ordered_add:
475  case Intrinsic::amdgcn_ds_ordered_swap:
476  case Intrinsic::amdgcn_ds_fadd:
477  case Intrinsic::amdgcn_ds_fmin:
478  case Intrinsic::amdgcn_ds_fmax: {
479  auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
480  auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
481  if (!Ordering || !Volatile)
482  return false; // Invalid.
483 
484  unsigned OrderingVal = Ordering->getZExtValue();
485  if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
486  return false;
487 
488  Info.PtrVal = Inst->getArgOperand(0);
489  Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
490  Info.ReadMem = true;
491  Info.WriteMem = true;
492  Info.IsVolatile = !Volatile->isNullValue();
493  return true;
494  }
495  default:
496  return false;
497  }
498 }
499 
500 int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
501  TTI::TargetCostKind CostKind,
502  TTI::OperandValueKind Opd1Info,
503  TTI::OperandValueKind Opd2Info,
504  TTI::OperandValueProperties Opd1PropInfo,
505  TTI::OperandValueProperties Opd2PropInfo,
506  ArrayRef<const Value *> Args,
507  const Instruction *CxtI) {
508  EVT OrigTy = TLI->getValueType(DL, Ty);
509  if (!OrigTy.isSimple()) {
510  // FIXME: We're having to query the throughput cost so that the basic
511  // implementation tries to generate legalize and scalarization costs. Maybe
512  // we could hoist the scalarization code here?
513  if (CostKind != TTI::TCK_CodeSize)
514  return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
515  Opd1Info, Opd2Info, Opd1PropInfo,
516  Opd2PropInfo, Args, CxtI);
517  // Scalarization
518 
519  // Check if any of the operands are vector operands.
520  int ISD = TLI->InstructionOpcodeToISD(Opcode);
521  assert(ISD && "Invalid opcode");
522 
523  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
524 
525  bool IsFloat = Ty->isFPOrFPVectorTy();
526  // Assume that floating point arithmetic operations cost twice as much as
527  // integer operations.
528  unsigned OpCost = (IsFloat ? 2 : 1);
529 
530  if (TLI->isOperationLegalOrPromote(ISD, LT.second)) {
531  // The operation is legal. Assume it costs 1.
532  // TODO: Once we have extract/insert subvector cost we need to use them.
533  return LT.first * OpCost;
534  }
535 
536  if (!TLI->isOperationExpand(ISD, LT.second)) {
537  // If the operation is custom lowered, then assume that the code is twice
538  // as expensive.
539  return LT.first * 2 * OpCost;
540  }
541 
542  // Else, assume that we need to scalarize this op.
543  // TODO: If one of the types get legalized by splitting, handle this
544  // similarly to what getCastInstrCost() does.
545  if (auto *VTy = dyn_cast<VectorType>(Ty)) {
546  unsigned Num = cast<FixedVectorType>(VTy)->getNumElements();
547  unsigned Cost = getArithmeticInstrCost(
548  Opcode, VTy->getScalarType(), CostKind, Opd1Info, Opd2Info,
549  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
550  // Return the cost of multiple scalar invocation plus the cost of
551  // inserting and extracting the values.
552  SmallVector<Type *> Tys(Args.size(), Ty);
553  return getScalarizationOverhead(VTy, Args, Tys) + Num * Cost;
554  }
555 
556  // We don't know anything about this scalar instruction.
557  return OpCost;
558  }
559 
560  // Legalize the type.
561  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
562  int ISD = TLI->InstructionOpcodeToISD(Opcode);
563 
564  // Because we don't have any legal vector operations, only legal vector
565  // types, we need to account for split vectors.
566  unsigned NElts = LT.second.isVector() ?
567  LT.second.getVectorNumElements() : 1;
568 
569  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
570 
571  switch (ISD) {
572  case ISD::SHL:
573  case ISD::SRL:
574  case ISD::SRA:
575  if (SLT == MVT::i64)
576  return get64BitInstrCost(CostKind) * LT.first * NElts;
577 
578  if (ST->has16BitInsts() && SLT == MVT::i16)
579  NElts = (NElts + 1) / 2;
580 
581  // i32
582  return getFullRateInstrCost() * LT.first * NElts;
583  case ISD::ADD:
584  case ISD::SUB:
585  case ISD::AND:
586  case ISD::OR:
587  case ISD::XOR:
588  if (SLT == MVT::i64) {
589  // and, or and xor are typically split into 2 VALU instructions.
590  return 2 * getFullRateInstrCost() * LT.first * NElts;
591  }
592 
593  if (ST->has16BitInsts() && SLT == MVT::i16)
594  NElts = (NElts + 1) / 2;
595 
596  return LT.first * NElts * getFullRateInstrCost();
597  case ISD::MUL: {
598  const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
599  if (SLT == MVT::i64) {
600  const int FullRateCost = getFullRateInstrCost();
601  return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
602  }
603 
604  if (ST->has16BitInsts() && SLT == MVT::i16)
605  NElts = (NElts + 1) / 2;
606 
607  // i32
608  return QuarterRateCost * NElts * LT.first;
609  }
610  case ISD::FMUL:
611  // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
612  // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
613  // fused operation.
614  if (CxtI && CxtI->hasOneUse())
615  if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
616  const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
617  if (OPC == ISD::FADD || OPC == ISD::FSUB) {
618  if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
619  return TargetTransformInfo::TCC_Free;
620  if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
621  return TargetTransformInfo::TCC_Free;
622 
623  // Estimate all types may be fused with contract/unsafe flags
624  const TargetOptions &Options = TLI->getTargetMachine().Options;
625  if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
626  Options.UnsafeFPMath ||
627  (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
628  return TargetTransformInfo::TCC_Free;
629  }
630  }
631  LLVM_FALLTHROUGH;
632  case ISD::FADD:
633  case ISD::FSUB:
634  if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
635  NElts = (NElts + 1) / 2;
636  if (SLT == MVT::f64)
637  return LT.first * NElts * get64BitInstrCost(CostKind);
638 
639  if (ST->has16BitInsts() && SLT == MVT::f16)
640  NElts = (NElts + 1) / 2;
641 
642  if (SLT == MVT::f32 || SLT == MVT::f16)
643  return LT.first * NElts * getFullRateInstrCost();
644  break;
645  case ISD::FDIV:
646  case ISD::FREM:
647  // FIXME: frem should be handled separately. The fdiv in it is most of it,
648  // but the current lowering is also not entirely correct.
649  if (SLT == MVT::f64) {
650  int Cost = 7 * get64BitInstrCost(CostKind) +
651  getQuarterRateInstrCost(CostKind) +
652  3 * getHalfRateInstrCost(CostKind);
653  // Add cost of workaround.
654  if (!ST->hasUsableDivScaleConditionOutput())
655  Cost += 3 * getFullRateInstrCost();
656 
657  return LT.first * Cost * NElts;
658  }
659 
660  if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
661  // TODO: This is more complicated, unsafe flags etc.
662  if ((SLT == MVT::f32 && !HasFP32Denormals) ||
663  (SLT == MVT::f16 && ST->has16BitInsts())) {
664  return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
665  }
666  }
667 
668  if (SLT == MVT::f16 && ST->has16BitInsts()) {
669  // 2 x v_cvt_f32_f16
670  // f32 rcp
671  // f32 fmul
672  // v_cvt_f16_f32
673  // f16 div_fixup
674  int Cost =
675  4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
676  return LT.first * Cost * NElts;
677  }
678 
679  if (SLT == MVT::f32 || SLT == MVT::f16) {
680  // 4 more v_cvt_* insts without f16 insts support
681  int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
682  1 * getQuarterRateInstrCost(CostKind);
683 
684  if (!HasFP32Denormals) {
685  // FP mode switches.
686  Cost += 2 * getFullRateInstrCost();
687  }
688 
689  return LT.first * NElts * Cost;
690  }
691  break;
692  case ISD::FNEG:
693  // Use the backend's estimation. If fneg is not free, each element will cost
694  // one additional instruction.
695  return TLI->isFNegFree(SLT) ? 0 : NElts;
696  default:
697  break;
698  }
699 
700  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
701  Opd1PropInfo, Opd2PropInfo, Args, CxtI);
702 }
703 
704 // Return true if there's a potential benefit from using v2f16/v2i16
705 // instructions for an intrinsic, even if it requires nontrivial legalization.
706 static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
707  switch (ID) {
708  case Intrinsic::fma: // TODO: fmuladd
709  // There's a small benefit to using vector ops in the legalized code.
710  case Intrinsic::round:
711  case Intrinsic::uadd_sat:
712  case Intrinsic::usub_sat:
713  case Intrinsic::sadd_sat:
714  case Intrinsic::ssub_sat:
715  return true;
716  default:
717  return false;
718  }
719 }
720 
721 int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
722  TTI::TargetCostKind CostKind) {
723  if (ICA.getID() == Intrinsic::fabs)
724  return 0;
725 
726  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
727  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
728 
729  Type *RetTy = ICA.getReturnType();
730  EVT OrigTy = TLI->getValueType(DL, RetTy);
731  if (!OrigTy.isSimple()) {
732  if (CostKind != TTI::TCK_CodeSize)
733  return BaseT::getIntrinsicInstrCost(ICA, CostKind);
734 
735  // TODO: Combine these two logic paths.
736  if (ICA.isTypeBasedOnly())
737  return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
738 
739  unsigned RetVF =
740  (RetTy->isVectorTy() ? cast<FixedVectorType>(RetTy)->getNumElements()
741  : 1);
742  const IntrinsicInst *I = ICA.getInst();
743  const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
744  FastMathFlags FMF = ICA.getFlags();
745  // Assume that we need to scalarize this intrinsic.
746 
747  // Compute the scalarization overhead based on Args for a vector
748  // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
749  // CostModel will pass a vector RetTy and VF is 1.
750  unsigned ScalarizationCost = std::numeric_limits<unsigned>::max();
751  if (RetVF > 1) {
752  ScalarizationCost = 0;
753  if (!RetTy->isVoidTy())
754  ScalarizationCost +=
755  getScalarizationOverhead(cast<VectorType>(RetTy), true, false);
756  ScalarizationCost +=
757  getOperandsScalarizationOverhead(Args, ICA.getArgTypes());
758  }
759 
760  IntrinsicCostAttributes Attrs(ICA.getID(), RetTy, ICA.getArgTypes(), FMF, I,
761  ScalarizationCost);
762  return getIntrinsicInstrCost(Attrs, CostKind);
763  }
764 
765  // Legalize the type.
766  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, RetTy);
767 
768  unsigned NElts = LT.second.isVector() ?
769  LT.second.getVectorNumElements() : 1;
770 
771  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
772 
773  if (SLT == MVT::f64)
774  return LT.first * NElts * get64BitInstrCost(CostKind);
775 
776  if ((ST->has16BitInsts() && SLT == MVT::f16) ||
777  (ST->hasPackedFP32Ops() && SLT == MVT::f32))
778  NElts = (NElts + 1) / 2;
779 
780  // TODO: Get more refined intrinsic costs?
781  unsigned InstRate = getQuarterRateInstrCost(CostKind);
782 
783  switch (ICA.getID()) {
784  case Intrinsic::fma:
785  InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
786  : getQuarterRateInstrCost(CostKind);
787  break;
788  case Intrinsic::uadd_sat:
789  case Intrinsic::usub_sat:
790  case Intrinsic::sadd_sat:
791  case Intrinsic::ssub_sat:
792  static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
793  if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
794  NElts = 1;
795  break;
796  }
797 
798  return LT.first * NElts * InstRate;
799 }
800 
801 unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
802  TTI::TargetCostKind CostKind) {
803  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
804  return Opcode == Instruction::PHI ? 0 : 1;
805 
806  // XXX - For some reason this isn't called for switch.
807  switch (Opcode) {
808  case Instruction::Br:
809  case Instruction::Ret:
810  return 10;
811  default:
812  return BaseT::getCFInstrCost(Opcode, CostKind);
813  }
814 }
815 
816 int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
817  bool IsPairwise,
818  TTI::TargetCostKind CostKind) {
819  EVT OrigTy = TLI->getValueType(DL, Ty);
820 
821  // Computes the cost on targets that have packed math instructions (which
822  // support 16-bit types only).
823  if (IsPairwise ||
824  !ST->hasVOP3PInsts() ||
825  OrigTy.getScalarSizeInBits() != 16)
826  return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
827 
828  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
829  return LT.first * getFullRateInstrCost();
830 }
831 
832 int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
833  bool IsPairwise, bool IsUnsigned,
834  TTI::TargetCostKind CostKind) {
835  EVT OrigTy = TLI->getValueType(DL, Ty);
836 
837  // Computes the cost on targets that have packed math instructions (which
838  // support 16-bit types only).
839  if (IsPairwise ||
840  !ST->hasVOP3PInsts() ||
841  OrigTy.getScalarSizeInBits() != 16)
842  return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
843  CostKind);
844 
845  std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
846  return LT.first * getHalfRateInstrCost(CostKind);
847 }
848 
849 int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
850  unsigned Index) {
851  switch (Opcode) {
852  case Instruction::ExtractElement:
853  case Instruction::InsertElement: {
854  unsigned EltSize
855  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
856  if (EltSize < 32) {
857  if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
858  return 0;
859  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
860  }
861 
862  // Extracts are just reads of a subregister, so are free. Inserts are
863  // considered free because we don't want to have any cost for scalarizing
864  // operations, and we don't have to copy into a different register class.
865 
866  // Dynamic indexing isn't free and is best avoided.
867  return Index == ~0u ? 2 : 0;
868  }
869  default:
870  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
871  }
872 }
873 
874 /// Analyze if the results of inline asm are divergent. If \p Indices is empty,
875 /// this is analyzing the collective result of all output registers. Otherwise,
876 /// this is only querying a specific result index if this returns multiple
877 /// registers in a struct.
878 bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
879  const CallInst *CI, ArrayRef<unsigned> Indices) const {
880  // TODO: Handle complex extract indices
881  if (Indices.size() > 1)
882  return true;
883 
884  const DataLayout &DL = CI->getModule()->getDataLayout();
885  const SIRegisterInfo *TRI = ST->getRegisterInfo();
886  TargetLowering::AsmOperandInfoVector TargetConstraints =
887  TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
888 
889  const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
890 
891  int OutputIdx = 0;
892  for (auto &TC : TargetConstraints) {
893  if (TC.Type != InlineAsm::isOutput)
894  continue;
895 
896  // Skip outputs we don't care about.
897  if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
898  continue;
899 
900  TLI->ComputeConstraintToUse(TC, SDValue());
901 
902  Register AssignedReg;
903  const TargetRegisterClass *RC;
904  std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
905  TRI, TC.ConstraintCode, TC.ConstraintVT);
906  if (AssignedReg) {
907  // FIXME: This is a workaround for getRegForInlineAsmConstraint
908  // returning VS_32
909  RC = TRI->getPhysRegClass(AssignedReg);
910  }
911 
912  // For AGPR constraints null is returned on subtargets without AGPRs, so
913  // assume divergent for null.
914  if (!RC || !TRI->isSGPRClass(RC))
915  return true;
916  }
917 
918  return false;
919 }
920 
921 /// \returns true if the new GPU divergence analysis is enabled.
922 bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
923  return !UseLegacyDA;
924 }
925 
926 /// \returns true if the result of the value could potentially be
927 /// different across workitems in a wavefront.
928 bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
929  if (const Argument *A = dyn_cast<Argument>(V))
930  return !AMDGPU::isArgPassedInSGPR(A);
931 
932  // Loads from the private and flat address spaces are divergent, because
933  // threads can execute the load instruction with the same inputs and get
934  // different results.
935  //
936  // All other loads are not divergent, because if threads issue loads with the
937  // same arguments, they will always get the same result.
938  if (const LoadInst *Load = dyn_cast<LoadInst>(V))
939  return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
940  Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
941 
942  // Atomics are divergent because they are executed sequentially: when an
943  // atomic operation refers to the same address in each thread, then each
944  // thread after the first sees the value written by the previous thread as
945  // the original value.
946  if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
947  return true;
948 
949  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
950  return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
951 
952  // Assume all function calls are a source of divergence.
953  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
954  if (CI->isInlineAsm())
955  return isInlineAsmSourceOfDivergence(CI);
956  return true;
957  }
958 
959  // Assume all function calls are a source of divergence.
960  if (isa<InvokeInst>(V))
961  return true;
962 
963  return false;
964 }
965 
966 bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
967  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
968  switch (Intrinsic->getIntrinsicID()) {
969  default:
970  return false;
971  case Intrinsic::amdgcn_readfirstlane:
972  case Intrinsic::amdgcn_readlane:
973  case Intrinsic::amdgcn_icmp:
974  case Intrinsic::amdgcn_fcmp:
975  case Intrinsic::amdgcn_ballot:
976  case Intrinsic::amdgcn_if_break:
977  return true;
978  }
979  }
980 
981  if (const CallInst *CI = dyn_cast<CallInst>(V)) {
982  if (CI->isInlineAsm())
983  return !isInlineAsmSourceOfDivergence(CI);
984  return false;
985  }
986 
987  const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
988  if (!ExtValue)
989  return false;
990 
991  const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
992  if (!CI)
993  return false;
994 
995  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
996  switch (Intrinsic->getIntrinsicID()) {
997  default:
998  return false;
999  case Intrinsic::amdgcn_if:
1000  case Intrinsic::amdgcn_else: {
1001  ArrayRef<unsigned> Indices = ExtValue->getIndices();
1002  return Indices.size() == 1 && Indices[0] == 1;
1003  }
1004  }
1005  }
1006 
1007  // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1008  // divergent for the overall struct return. We need to override it in the
1009  // case we're extracting an SGPR component here.
1010  if (CI->isInlineAsm())
1011  return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1012 
1013  return false;
1014 }
1015 
1016 bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1017  Intrinsic::ID IID) const {
1018  switch (IID) {
1019  case Intrinsic::amdgcn_atomic_inc:
1020  case Intrinsic::amdgcn_atomic_dec:
1021  case Intrinsic::amdgcn_ds_fadd:
1022  case Intrinsic::amdgcn_ds_fmin:
1023  case Intrinsic::amdgcn_ds_fmax:
1024  case Intrinsic::amdgcn_is_shared:
1025  case Intrinsic::amdgcn_is_private:
1026  OpIndexes.push_back(0);
1027  return true;
1028  default:
1029  return false;
1030  }
1031 }
1032 
1033 Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1034  Value *OldV,
1035  Value *NewV) const {
1036  auto IntrID = II->getIntrinsicID();
1037  switch (IntrID) {
1038  case Intrinsic::amdgcn_atomic_inc:
1039  case Intrinsic::amdgcn_atomic_dec:
1040  case Intrinsic::amdgcn_ds_fadd:
1041  case Intrinsic::amdgcn_ds_fmin:
1042  case Intrinsic::amdgcn_ds_fmax: {
1043  const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1044  if (!IsVolatile->isZero())
1045  return nullptr;
1046  Module *M = II->getParent()->getParent()->getParent();
1047  Type *DestTy = II->getType();
1048  Type *SrcTy = NewV->getType();
1049  Function *NewDecl =
1050  Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1051  II->setArgOperand(0, NewV);
1052  II->setCalledFunction(NewDecl);
1053  return II;
1054  }
1055  case Intrinsic::amdgcn_is_shared:
1056  case Intrinsic::amdgcn_is_private: {
1057  unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1058  AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1059  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1060  LLVMContext &Ctx = NewV->getType()->getContext();
1061  ConstantInt *NewVal = (TrueAS == NewAS) ?
1062  ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1063  return NewVal;
1064  }
1065  case Intrinsic::ptrmask: {
1066  unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1067  unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1068  Value *MaskOp = II->getArgOperand(1);
1069  Type *MaskTy = MaskOp->getType();
1070 
1071  bool DoTruncate = false;
1072 
1073  const GCNTargetMachine &TM =
1074  static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1075  if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1076  // All valid 64-bit to 32-bit casts work by chopping off the high
1077  // bits. Any masking only clearing the low bits will also apply in the new
1078  // address space.
1079  if (DL.getPointerSizeInBits(OldAS) != 64 ||
1080  DL.getPointerSizeInBits(NewAS) != 32)
1081  return nullptr;
1082 
1083  // TODO: Do we need to thread more context in here?
1084  KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1085  if (Known.countMinLeadingOnes() < 32)
1086  return nullptr;
1087 
1088  DoTruncate = true;
1089  }
1090 
1091  IRBuilder<> B(II);
1092  if (DoTruncate) {
1093  MaskTy = B.getInt32Ty();
1094  MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1095  }
1096 
1097  return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1098  {NewV, MaskOp});
1099  }
1100  default:
1101  return nullptr;
1102  }
1103 }
1104 
1105 unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
1106  int Index, VectorType *SubTp) {
1107  if (ST->hasVOP3PInsts()) {
1108  if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1109  DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1110  // With op_sel, VOP3P instructions can freely access the low half or high
1111  // half of a register, so any swizzle is free.
1112 
1113  switch (Kind) {
1114  case TTI::SK_Broadcast:
1115  case TTI::SK_Reverse:
1116  case TTI::SK_PermuteSingleSrc:
1117  return 0;
1118  default:
1119  break;
1120  }
1121  }
1122  }
1123 
1124  return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
1125 }
1126 
1127 bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1128  const Function *Callee) const {
1129  const TargetMachine &TM = getTLI()->getTargetMachine();
1130  const GCNSubtarget *CallerST
1131  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1132  const GCNSubtarget *CalleeST
1133  = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1134 
1135  const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1136  const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1137 
1138  FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1139  FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1140  if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1141  return false;
1142 
1143  // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1144  // no way to support merge for backend defined attributes.
1145  AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
1146  AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
1147  if (!CallerMode.isInlineCompatible(CalleeMode))
1148  return false;
1149 
1150  if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1151  Callee->hasFnAttribute(Attribute::InlineHint))
1152  return true;
1153 
1154  // Hack to make compile times reasonable.
1155  if (InlineMaxBB) {
1156  // Single BB does not increase total BB amount.
1157  if (Callee->size() == 1)
1158  return true;
1159  size_t BBSize = Caller->size() + Callee->size() - 1;
1160  return BBSize <= InlineMaxBB;
1161  }
1162 
1163  return true;
1164 }
1165 
1166 unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1167  // If we have a pointer to a private array passed into a function,
1168  // it will not be optimized out, leaving scratch usage.
1169  // Increase the inline threshold to allow inlining in this case.
1170  uint64_t AllocaSize = 0;
1171  SmallPtrSet<const AllocaInst *, 8> AIVisited;
1172  for (Value *PtrArg : CB->args()) {
1173  PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1174  if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
1175  Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
1176  continue;
1177 
1178  PtrArg = getUnderlyingObject(PtrArg);
1179  if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
1180  if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1181  continue;
1182  AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1183  // If the amount of stack memory is excessive we will not be able
1184  // to get rid of the scratch anyway, bail out.
1185  if (AllocaSize > ArgAllocaCutoff) {
1186  AllocaSize = 0;
1187  break;
1188  }
1189  }
1190  }
1191  if (AllocaSize)
1192  return ArgAllocaCost;
1193  return 0;
1194 }
1195 
1196 void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1197  TTI::UnrollingPreferences &UP) {
1198  CommonTTI.getUnrollingPreferences(L, SE, UP);
1199 }
1200 
1201 void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1202  TTI::PeelingPreferences &PP) {
1203  CommonTTI.getPeelingPreferences(L, SE, PP);
1204 }
1205 
1206 int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1207  return ST->hasFullRate64Ops()
1208  ? getFullRateInstrCost()
1209  : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1210  : getQuarterRateInstrCost(CostKind);
1211 }
1212 
1213 R600TTIImpl::R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
1214  : BaseT(TM, F.getParent()->getDataLayout()),
1215  ST(static_cast<const R600Subtarget *>(TM->getSubtargetImpl(F))),
1216  TLI(ST->getTargetLowering()), CommonTTI(TM, F) {}
1217 
1218 unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
1219  return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
1220 }
1221 
1222 unsigned R600TTIImpl::getNumberOfRegisters(bool Vec) const {
1223  return getHardwareNumberOfRegisters(Vec);
1224 }
1225 
1226 unsigned R600TTIImpl::getRegisterBitWidth(bool Vector) const {
1227  return 32;
1228 }
1229 
1230 unsigned R600TTIImpl::getMinVectorRegisterBitWidth() const {
1231  return 32;
1232 }
1233 
1234 unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
1235  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
1236  AddrSpace == AMDGPUAS::CONSTANT_ADDRESS)
1237  return 128;
1238  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
1239  AddrSpace == AMDGPUAS::REGION_ADDRESS)
1240  return 64;
1241  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
1242  return 32;
1243 
1244  if ((AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
1245  AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
1246  (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
1247  AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
1248  return 128;
1249  llvm_unreachable("unhandled address space");
1250 }
1251 
1252 bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
1253  Align Alignment,
1254  unsigned AddrSpace) const {
1255  // We allow vectorization of flat stores, even though we may need to decompose
1256  // them later if they may access private memory. We don't have enough context
1257  // here, and legalization can handle it.
1258  return (AddrSpace != AMDGPUAS::PRIVATE_ADDRESS);
1259 }
1260 
1261 bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
1262  Align Alignment,
1263  unsigned AddrSpace) const {
1264  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1265 }
1266 
1267 bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
1268  Align Alignment,
1269  unsigned AddrSpace) const {
1270  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
1271 }
1272 
1273 unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
1274  // Disable unrolling if the loop is not vectorized.
1275  // TODO: Enable this again.
1276  if (VF == 1)
1277  return 1;
1278 
1279  return 8;
1280 }
1281 
1282 unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
1283  TTI::TargetCostKind CostKind) {
1284  if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
1285  return Opcode == Instruction::PHI ? 0 : 1;
1286 
1287  // XXX - For some reason this isn't called for switch.
1288  switch (Opcode) {
1289  case Instruction::Br:
1290  case Instruction::Ret:
1291  return 10;
1292  default:
1293  return BaseT::getCFInstrCost(Opcode, CostKind);
1294  }
1295 }
1296 
1297 int R600TTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
1298  unsigned Index) {
1299  switch (Opcode) {
1300  case Instruction::ExtractElement:
1301  case Instruction::InsertElement: {
1302  unsigned EltSize
1303  = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1304  if (EltSize < 32) {
1305  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1306  }
1307 
1308  // Extracts are just reads of a subregister, so are free. Inserts are
1309  // considered free because we don't want to have any cost for scalarizing
1310  // operations, and we don't have to copy into a different register class.
1311 
1312  // Dynamic indexing isn't free and is best avoided.
1313  return Index == ~0u ? 2 : 0;
1314  }
1315  default:
1316  return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
1317  }
1318 }
1319 
1320 void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1321  TTI::UnrollingPreferences &UP) {
1322  CommonTTI.getUnrollingPreferences(L, SE, UP);
1323 }
1324 
1325 void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1326  TTI::PeelingPreferences &PP) {
1327  CommonTTI.getPeelingPreferences(L, SE, PP);
1328 }
llvm::ISD::SUB
@ SUB
Definition: ISDOpcodes.h:233
UseLegacyDA
static cl::opt< bool > UseLegacyDA("amdgpu-use-legacy-divergence-analysis", cl::desc("Enable legacy divergence analysis for AMDGPU"), cl::init(false), cl::Hidden)
llvm::getUnderlyingObject
Value * getUnderlyingObject(Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
Definition: ValueTracking.cpp:4165
llvm::EngineKind::Kind
Kind
Definition: ExecutionEngine.h:524
llvm::AMDGPUAS::CONSTANT_BUFFER_15
@ CONSTANT_BUFFER_15
Definition: AMDGPU.h:398
llvm::Argument
This class represents an incoming formal argument to a Function.
Definition: Argument.h:29
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticInstrCost
unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:660
Attrs
Function Attrs
Definition: README_ALTIVEC.txt:215
llvm::Type::isSized
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:272
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::DL
const DataLayout & DL
Definition: TargetTransformInfoImpl.h:38
llvm::TargetTransformInfo::UnrollingPreferences::Runtime
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
Definition: TargetTransformInfo.h:477
llvm::TargetTransformInfo::TargetCostKind
TargetCostKind
The kind of cost model.
Definition: TargetTransformInfo.h:210
llvm::Loop::isLoopInvariant
bool isLoopInvariant(const Value *V) const
Return true if the specified value is loop invariant.
Definition: LoopInfo.cpp:63
llvm::TargetTransformInfo::UnrollingPreferences::PartialThreshold
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
Definition: TargetTransformInfo.h:445
llvm
This class represents lattice values for constants.
Definition: AllocatorList.h:23
M
We currently emits eax Perhaps this is what we really should generate is Is imull three or four cycles eax eax The current instruction priority is based on pattern complexity The former is more complex because it folds a load so the latter will not be emitted Perhaps we should use AddedComplexity to give LEA32r a higher priority We should always try to match LEA first since the LEA matching code does some estimate to determine whether the match is profitable if we care more about code then imull is better It s two bytes shorter than movl leal On a Pentium M
Definition: README.txt:252
llvm::Instruction::getModule
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr it the function does not...
Definition: Instruction.cpp:65
llvm::none_of
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1512
llvm::SystemZISD::TM
@ TM
Definition: SystemZISelLowering.h:65
llvm::DataLayout
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:111
llvm::ISD::OR
@ OR
Definition: ISDOpcodes.h:589
llvm::Value::hasOneUse
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:437
llvm::Intrinsic::getDeclaration
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=None)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1254
llvm::TargetTransformInfo::UnrollingPreferences::MaxCount
unsigned MaxCount
Definition: TargetTransformInfo.h:461
llvm::BasicBlock::getParent
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:107
llvm::TargetOptions
Definition: TargetOptions.h:122
llvm::Function
Definition: Function.h:61
llvm::Loop
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:529
llvm::LoopBase::contains
bool contains(const LoopT *L) const
Return true if the specified loop is contained within in this loop.
Definition: LoopInfo.h:122
llvm::AMDGPUTargetLowering::isFNegFree
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
Definition: AMDGPUISelLowering.cpp:847
llvm::IntrinsicInst::getIntrinsicID
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:51
llvm::CallBase::setCalledFunction
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1432
llvm::DataLayout::getTypeSizeInBits
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:661
llvm::PatternMatch::m_FPOne
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:799
llvm::GCNTTIImpl::isSourceOfDivergence
bool isSourceOfDivergence(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:928
llvm::SmallVector
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1168
llvm::CallBase::isInlineAsm
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1462
llvm::BasicTTIImplBase< GCNTTIImpl >::getOperandsScalarizationOverhead
unsigned getOperandsScalarizationOverhead(ArrayRef< const Value * > Args, ArrayRef< Type * > Tys)
Estimate the overhead of scalarizing an instructions unique non-constant operands.
Definition: BasicTTIImpl.h:615
llvm::Type::getPointerAddressSpace
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Definition: DerivedTypes.h:690
llvm::IRBuilder<>
llvm::IntrinsicCostAttributes::getReturnType
Type * getReturnType() const
Definition: TargetTransformInfo.h:147
llvm::PointerType::getAddressSpace
unsigned getAddressSpace() const
Return the address space of the Pointer type.
Definition: DerivedTypes.h:662
llvm::ScalarEvolution
The main scalar evolution driver.
Definition: ScalarEvolution.h:443
llvm::IntrinsicCostAttributes::getInst
const IntrinsicInst * getInst() const
Definition: TargetTransformInfo.h:146
ValueTracking.h
llvm::AtomicOrdering::SequentiallyConsistent
@ SequentiallyConsistent
llvm::R600TTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vec) const
Definition: AMDGPUTargetTransformInfo.cpp:1218
llvm::Type::isFPOrFPVectorTy
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition: Type.h:190
llvm::TargetTransformInfo::TCK_CodeSize
@ TCK_CodeSize
Instruction code size.
Definition: TargetTransformInfo.h:213
llvm::GCNTTIImpl::getMemcpyLoopLoweringType
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:397
llvm::cl::Hidden
@ Hidden
Definition: CommandLine.h:140
llvm::GCNTTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:365
llvm::R600TTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1325
llvm::Depth
@ Depth
Definition: SIMachineScheduler.h:34
llvm::AMDGPUAS::GLOBAL_ADDRESS
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:361
llvm::TargetTransformInfo::UnrollingPreferences::Partial
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
Definition: TargetTransformInfo.h:473
dependsOnLocalPhi
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
Definition: AMDGPUTargetTransformInfo.cpp:77
llvm::Type
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:46
llvm::GCNTTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:460
llvm::TargetTransformInfo::PeelingPreferences
Definition: TargetTransformInfo.h:526
llvm::SITargetLowering::getRegForInlineAsmConstraint
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
Definition: SIISelLowering.cpp:11342
llvm::AMDGPU::SIModeRegisterDefaults
Definition: AMDGPUBaseInfo.h:867
llvm::GCNTTIImpl::areInlineCompatible
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
Definition: AMDGPUTargetTransformInfo.cpp:1127
llvm::FeatureBitset
Container class for subtarget features.
Definition: SubtargetFeature.h:40
llvm::SmallPtrSet
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:449
llvm::GCNSubtarget
Definition: GCNSubtarget.h:38
llvm::VectorType::getElementType
Type * getElementType() const
Definition: DerivedTypes.h:424
llvm::Value::user_begin
user_iterator user_begin()
Definition: Value.h:400
llvm::TargetLowering::ComputeConstraintToUse
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Definition: TargetLowering.cpp:4957
llvm::MipsISD::Ret
@ Ret
Definition: MipsISelLowering.h:116
llvm::AMDGPU::SIModeRegisterDefaults::isInlineCompatible
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Definition: AMDGPUBaseInfo.h:952
llvm::AMDGPUAS::FLAT_ADDRESS
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:360
llvm::FastMathFlags
Convenience struct for specifying and reasoning about fast-math flags.
Definition: Operator.h:160
llvm::AMDGPU::IsaInfo::getMaxNumVGPRs
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
Definition: AMDGPUBaseInfo.cpp:586
llvm::GCNSubtarget::hasPackedFP32Ops
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:823
llvm::AMDGPU::isIntrinsicSourceOfDivergence
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
Definition: AMDGPUBaseInfo.cpp:1813
llvm::TargetTransformInfo::SK_PermuteSingleSrc
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
Definition: TargetTransformInfo.h:850
llvm::Type::getInt8Ty
static IntegerType * getInt8Ty(LLVMContext &C)
Definition: Type.cpp:195
TRI
unsigned const TargetRegisterInfo * TRI
Definition: MachineSink.cpp:1541
llvm::BasicTTIImplBase< GCNTTIImpl >::getArithmeticReductionCost
unsigned getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind)
Try to calculate arithmetic and shuffle op costs for reduction operations.
Definition: BasicTTIImpl.h:1893
llvm::Type::getInt32Ty
static IntegerType * getInt32Ty(LLVMContext &C)
Definition: Type.cpp:197
llvm::R600TTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1267
llvm::GCNSubtarget::getRegisterInfo
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:232
llvm::ArrayRef::empty
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:158
LLVM_DEBUG
#define LLVM_DEBUG(X)
Definition: Debug.h:122
llvm::TargetTransformInfo::SK_Broadcast
@ SK_Broadcast
Broadcast element 0 to all other elements.
Definition: TargetTransformInfo.h:840
F
#define F(x, y, z)
Definition: MD5.cpp:56
llvm::GCNTTIImpl::getCFInstrCost
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:801
KnownBits.h
llvm::AMDGPUAS::CONSTANT_ADDRESS
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:364
llvm::BasicBlock
LLVM Basic Block Representation.
Definition: BasicBlock.h:58
llvm::IntrinsicCostAttributes::getFlags
FastMathFlags getFlags() const
Definition: TargetTransformInfo.h:148
llvm::EVT::isSimple
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition: ValueTypes.h:124
llvm::MVT::SimpleValueType
SimpleValueType
Definition: MachineValueType.h:32
llvm::AArch64CC::LT
@ LT
Definition: AArch64BaseInfo.h:247
Context
LLVMContext & Context
Definition: NVVMIntrRange.cpp:66
llvm::dbgs
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:132
llvm::AMDGPUTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:102
llvm::IntrinsicCostAttributes::getArgTypes
const SmallVectorImpl< Type * > & getArgTypes() const
Definition: TargetTransformInfo.h:151
llvm::ConstantInt
This is the shared class of boolean and integer constants.
Definition: Constants.h:77
llvm::AllocaInst::isStaticAlloca
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Definition: Instructions.cpp:1361
llvm::LoopBase::getSubLoops
const std::vector< LoopT * > & getSubLoops() const
Return the loops contained entirely within this loop.
Definition: LoopInfo.h:143
llvm::MinAlign
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:673
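As a rough usage sketch (not taken from this file), MinAlign returns the largest power of two that divides both arguments, i.e. the alignment that still holds after offsetting an aligned pointer:
#include "llvm/Support/MathExtras.h"
#include <cassert>
// minAlignExample is an illustrative helper, not an LLVM API.
inline void minAlignExample() {
  assert(llvm::MinAlign(16, 8) == 8);   // both values are 8-byte aligned
  assert(llvm::MinAlign(16, 12) == 4);  // 12 is only 4-byte aligned
}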
llvm::AMDGPUTargetMachine
Definition: AMDGPUTargetMachine.h:27
llvm::GCNSubtarget::hasFastFMAF32
bool hasFastFMAF32() const
Definition: GCNSubtarget.h:302
llvm::PatternMatch::match
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
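A minimal sketch of how the match() combinator above is typically used (isAddOfConstant is an invented helper, not part of this file):
#include "llvm/IR/PatternMatch.h"
// Returns true if V is an 'add' whose second operand is a constant integer.
static bool isAddOfConstant(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  llvm::Value *X;
  return match(V, m_Add(m_Value(X), m_ConstantInt()));
}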
llvm::AllocaInst::getAllocatedType
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:112
llvm::TargetTransformInfo::ShuffleKind
ShuffleKind
The various kinds of shuffle patterns for vector queries.
Definition: TargetTransformInfo.h:839
llvm::R600TTIImpl::getMaxInterleaveFactor
unsigned getMaxInterleaveFactor(unsigned VF)
Definition: AMDGPUTargetTransformInfo.cpp:1273
llvm::EVT
Extended Value Type.
Definition: ValueTypes.h:35
llvm::GCNTTIImpl::isLegalToVectorizeStoreChain
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:384
llvm::MVT::f64
@ f64
Definition: MachineValueType.h:53
round
static uint64_t round(uint64_t Acc, uint64_t Input)
Definition: xxhash.cpp:57
llvm::SITargetLowering::getTypeLegalizationCost
std::pair< int, MVT > getTypeLegalizationCost(const DataLayout &DL, Type *Ty) const
Definition: SIISelLowering.cpp:12082
llvm::AMDGPUAS::PARAM_D_ADDRESS
@ PARAM_D_ADDRESS
Address space for directly addressable parameter memory (CONST0).
Definition: AMDGPU.h:373
llvm::GCNTTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:317
llvm::TargetRegisterClass
Definition: TargetRegisterInfo.h:46
llvm::ISD::SRA
@ SRA
Definition: ISDOpcodes.h:614
llvm::GCNTTIImpl::getMemcpyLoopResidualLoweringType
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const
Definition: AMDGPUTargetTransformInfo.cpp:425
llvm::Type::isVectorTy
bool isVectorTy() const
True if this is an instance of VectorType.
Definition: Type.h:235
llvm::TargetTransformInfo::UnrollingPreferences::MaxIterationsCountToAnalyze
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
Definition: TargetTransformInfo.h:499
llvm::GCNSubtarget::getMaxPrivateElementSize
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:280
llvm::GCNTTIImpl::getLoadVectorFactor
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:329
B
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
llvm::LoopBase::getBlocks
ArrayRef< BlockT * > getBlocks() const
Get a list of the basic blocks which make up this loop.
Definition: LoopInfo.h:171
llvm::Instruction
Definition: Instruction.h:45
llvm::Type::getScalarSizeInBits
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition: Type.cpp:147
llvm::GCNTTIImpl::getArithmeticReductionCost
int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, bool IsPairwise, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:816
llvm::BasicTTIImplBase< GCNTTIImpl >::getShuffleCost
unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp)
Definition: BasicTTIImpl.h:718
Info
Analysis containing CSE Info
Definition: CSEInfo.cpp:26
llvm::SIRegisterInfo
Definition: SIRegisterInfo.h:27
llvm::GCNTTIImpl::isAlwaysUniform
bool isAlwaysUniform(const Value *V) const
Definition: AMDGPUTargetTransformInfo.cpp:966
llvm::ISD::AND
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:588
llvm::GCNTTIImpl::getShuffleCost
unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index, VectorType *SubTp)
Definition: AMDGPUTargetTransformInfo.cpp:1105
UnrollThresholdIf
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(150), cl::Hidden)
llvm::AMDGPUSubtarget::has16BitInsts
bool has16BitInsts() const
Definition: AMDGPUSubtarget.h:132
PatternMatch.h
llvm::FixedVectorType::get
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:643
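For context, a minimal sketch of the factory above building a <4 x i32> vector type (makeV4I32 is a hypothetical helper; Ctx is assumed to be an LLVMContext owned by the caller):
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
// Build the fixed-width vector type <4 x i32>.
llvm::FixedVectorType *makeV4I32(llvm::LLVMContext &Ctx) {
  return llvm::FixedVectorType::get(llvm::Type::getInt32Ty(Ctx), 4);
}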
llvm::Align
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
llvm::AMDGPUTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:255
llvm::MVT::v4i16
@ v4i16
Definition: MachineValueType.h:86
llvm::IntrinsicCostAttributes
Definition: TargetTransformInfo.h:115
LoopInfo.h
InlineMaxBB
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
AMDGPUTargetTransformInfo.h
llvm::GCNTTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:1196
llvm::SPII::Store
@ Store
Definition: SparcInstrInfo.h:33
llvm::ISD::FADD
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:371
llvm::TargetTransformInfo::SK_Reverse
@ SK_Reverse
Reverse the order of the vector.
Definition: TargetTransformInfo.h:841
llvm::VectorType
Base class of all SIMD vector types.
Definition: DerivedTypes.h:391
llvm::AtomicOrdering
AtomicOrdering
Atomic ordering for LLVM's memory model.
Definition: AtomicOrdering.h:56
llvm::cl::opt
Definition: CommandLine.h:1419
llvm::R600Subtarget
Definition: R600Subtarget.h:36
llvm::AMDGPUAS::CONSTANT_BUFFER_0
@ CONSTANT_BUFFER_0
Definition: AMDGPU.h:383
llvm::GCNTTIImpl::getTgtMemIntrinsic
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
Definition: AMDGPUTargetTransformInfo.cpp:469
llvm::TargetRegisterInfo::getRegClass
const TargetRegisterClass * getRegClass(unsigned i) const
Returns the register class associated with the enumeration value.
Definition: TargetRegisterInfo.h:723
llvm::AMDGPUAS::LOCAL_ADDRESS
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:365
Index
uint32_t Index
Definition: ELFObjHandler.cpp:84
llvm::SPII::Load
@ Load
Definition: SparcInstrInfo.h:32
llvm::KnownBits::countMinLeadingOnes
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:234
llvm::AMDGPUTTIImpl::AMDGPUTTIImpl
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:96
llvm::Instruction::hasAllowContract
bool hasAllowContract() const
Determine whether the allow-contract flag is set.
Definition: Instruction.cpp:248
llvm::GlobalValue::getParent
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:572
llvm::R600TTIImpl::isLegalToVectorizeMemChain
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1252
const
aarch64 promote const
Definition: AArch64PromoteConstant.cpp:232
llvm::ARM_MB::ST
@ ST
Definition: ARMBaseInfo.h:73
llvm::GCNTTIImpl::getMinMaxReductionCost
int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput)
Definition: AMDGPUTargetTransformInfo.cpp:832
llvm::LLVMContext
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:68
llvm::TargetLoweringBase::isOperationLegalOrPromote
bool isOperationLegalOrPromote(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal using promotion.
Definition: TargetLowering.h:1118
llvm::TargetLowering::ParseConstraints
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
Definition: TargetLowering.cpp:4607
llvm::AMDGPU::getIntegerAttribute
int getIntegerAttribute(const Function &F, StringRef Name, int Default)
Definition: AMDGPUBaseInfo.cpp:693
llvm::TargetTransformInfo::UnrollingPreferences
Parameters that control the generic loop unrolling transformation.
Definition: TargetTransformInfo.h:420
I
#define I(x, y, z)
Definition: MD5.cpp:59
llvm::TargetTransformInfo::OperandValueProperties
OperandValueProperties
Additional properties of an operand's values.
Definition: TargetTransformInfo.h:897
llvm::GetElementPtrInst
An instruction for type-safe pointer arithmetic to access elements of arrays and structs.
Definition: Instructions.h:905
llvm::cl::init
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:440
llvm::AMDGPUSubtarget::hasMadMacF32Insts
bool hasMadMacF32Insts() const
Definition: AMDGPUSubtarget.h:140
llvm::PointerType
Class to represent pointers.
Definition: DerivedTypes.h:634
llvm::BasicTTIImplBase< AMDGPUTTIImpl >
intrinsicHasPackedVectorBenefit
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
Definition: AMDGPUTargetTransformInfo.cpp:706
llvm::computeKnownBits
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, OptimizationRemarkEmitter *ORE=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
Definition: ValueTracking.cpp:238
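A minimal sketch combining the computeKnownBits overload above with KnownBits::countMinLeadingOnes (signBitKnownOne is an invented helper; V and DL are assumed to come from surrounding code):
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Support/KnownBits.h"
// True if the sign bit of the (scalar integer) value is known to be one.
static bool signBitKnownOne(const llvm::Value *V, const llvm::DataLayout &DL) {
  llvm::KnownBits Known(V->getType()->getScalarSizeInBits());
  llvm::computeKnownBits(V, Known, DL);
  return Known.countMinLeadingOnes() > 0;
}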
llvm::GCNTTIImpl::getArithmeticInstrCost
int getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueKind Opd1Info=TTI::OK_AnyValue, TTI::OperandValueKind Opd2Info=TTI::OK_AnyValue, TTI::OperandValueProperties Opd1PropInfo=TTI::OP_None, TTI::OperandValueProperties Opd2PropInfo=TTI::OP_None, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: AMDGPUTargetTransformInfo.cpp:500
llvm::TargetMachine::Options
TargetOptions Options
Definition: TargetMachine.h:115
assert
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef())
llvm::TargetMachine
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:77
llvm::LoopBase::getLoopDepth
unsigned getLoopDepth() const
Return the nesting level of this loop.
Definition: LoopInfo.h:96
llvm::AMDGPUAS::REGION_ADDRESS
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:362
llvm::R600TTIImpl::getUnrollingPreferences
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP)
Definition: AMDGPUTargetTransformInfo.cpp:1320
llvm::GCNSubtarget::hasUnalignedScratchAccess
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:509
Mode
SI Whole Quad Mode
Definition: SIWholeQuadMode.cpp:262
llvm::Type::isVoidTy
bool isVoidTy() const
Return true if this is 'void'.
Definition: Type.h:139
UnrollMaxBlockToAnalyze
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
llvm::MVT
Machine Value Type.
Definition: MachineValueType.h:30
llvm::Module
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:67
llvm::MDNode
Metadata node.
Definition: Metadata.h:893
llvm::TargetTransformInfo::OperandValueKind
OperandValueKind
Additional information about an operand's possible values.
Definition: TargetTransformInfo.h:889
llvm::AMDGPUAS::BUFFER_FAT_POINTER
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:370
llvm::GCNTTIImpl::getHardwareNumberOfRegisters
unsigned getHardwareNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:294
UnrollThresholdPrivate
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
llvm::R600TTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1261
llvm::GCNTargetMachine
Definition: AMDGPUTargetMachine.h:95
llvm::TargetTransformInfo::TCC_Free
@ TCC_Free
Expected to fold away in lowering.
Definition: TargetTransformInfo.h:263
llvm::R600TTIImpl::getCFInstrCost
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:1282
llvm::AMDGPUSubtarget::hasVOP3PInsts
bool hasVOP3PInsts() const
Definition: AMDGPUSubtarget.h:152
llvm::ArrayRef
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: APInt.h:32
llvm::min
Expected< ExpressionValue > min(const ExpressionValue &Lhs, const ExpressionValue &Rhs)
Definition: FileCheck.cpp:339
llvm::any_of
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1505
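As a small illustration of the range-based wrapper above (hasNegative is an invented helper):
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
// True if any element of Vals is negative.
static bool hasNegative(const llvm::SmallVectorImpl<int> &Vals) {
  return llvm::any_of(Vals, [](int V) { return V < 0; });
}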
llvm::MVT::i64
@ i64
Definition: MachineValueType.h:44
Cond
SmallVector< MachineOperand, 4 > Cond
Definition: BasicBlockSections.cpp:167
llvm::EVT::getScalarSizeInBits
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:348
llvm::GCNTTIImpl::getIntrinsicInstrCost
int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Definition: AMDGPUTargetTransformInfo.cpp:721
llvm::TargetTransformInfo::TCK_SizeAndLatency
@ TCK_SizeAndLatency
The weighted sum of size and latency.
Definition: TargetTransformInfo.h:214
llvm::GCNTTIImpl::isLegalToVectorizeLoadChain
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:378
llvm_unreachable
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Definition: ErrorHandling.h:136
llvm::Value::getType
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:246
llvm::GCNTTIImpl::isInlineAsmSourceOfDivergence
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
Definition: AMDGPUTargetTransformInfo.cpp:878
CostKind
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
getParent
static const Function * getParent(const Value *V)
Definition: BasicAliasAnalysis.cpp:780
llvm::TargetLoweringBase::InstructionOpcodeToISD
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
Definition: TargetLoweringBase.cpp:1744
llvm::AMDGPU::isGraphics
bool isGraphics(CallingConv::ID cc)
Definition: AMDGPUBaseInfo.cpp:1203
llvm::findOptionMDForLoop
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1013
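A minimal sketch of findOptionMDForLoop; the "llvm.loop.unroll.disable" string is just one example of a loop option name, not something this file necessarily queries:
#include "llvm/Analysis/LoopInfo.h"
// True if the loop carries the given option metadata.
static bool unrollDisabledByMetadata(const llvm::Loop *L) {
  return llvm::findOptionMDForLoop(L, "llvm.loop.unroll.disable") != nullptr;
}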
llvm::ConstantInt::getSExtValue
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:152
llvm::BasicTTIImplBase< AMDGPUTTIImpl >::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:470
llvm::AMDGPU::HSAMD::Kernel::Arg::Key::IsVolatile
constexpr char IsVolatile[]
Key for Kernel::Arg::Metadata::mIsVolatile.
Definition: AMDGPUMetadata.h:184
LLVM_FALLTHROUGH
#define LLVM_FALLTHROUGH
LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements.
Definition: Compiler.h:280
llvm::Type::getContext
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:128
llvm::LoadInst
An instruction for reading from memory.
Definition: Instructions.h:174
llvm::ISD::FMUL
@ FMUL
Definition: ISDOpcodes.h:373
llvm::CallBase::setArgOperand
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1346
UnrollThresholdLocal
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
llvm::ConstantInt::getFalse
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:831
llvm::Register
Wrapper class representing virtual and physical registers.
Definition: Register.h:19
llvm::ISD::XOR
@ XOR
Definition: ISDOpcodes.h:590
Callee
FunctionCallee Callee
Definition: AMDGPULibCalls.cpp:205
llvm::TargetLoweringBase::getTargetMachine
const TargetMachine & getTargetMachine() const
Definition: TargetLowering.h:335
llvm::LoopBase::isInnermost
bool isInnermost() const
Return true if the loop does not contain any (natural) loops.
Definition: LoopInfo.h:165
ArgAllocaCost
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
llvm::R600TTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:1234
llvm::GCNTTIImpl::getMaximumVF
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
Definition: AMDGPUTargetTransformInfo.cpp:321
llvm::Type::getInt64Ty
static IntegerType * getInt64Ty(LLVMContext &C)
Definition: Type.cpp:198
llvm::GCNTTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:300
llvm::AMDGPU::isArgPassedInSGPR
bool isArgPassedInSGPR(const Argument *A)
Definition: AMDGPUBaseInfo.cpp:1624
llvm::ConstantInt::getTrue
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:824
std
Definition: BitVector.h:941
llvm::KnownBits
Definition: KnownBits.h:23
llvm::BasicTTIImplBase< GCNTTIImpl >::getCFInstrCost
unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind)
Definition: BasicTTIImpl.h:898
llvm::TargetRegisterInfo::getRegSizeInBits
unsigned getRegSizeInBits(const TargetRegisterClass &RC) const
Return the size in bits of a register from class RC.
Definition: TargetRegisterInfo.h:274
llvm::GCNSubtarget::hasFullRate64Ops
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:310
llvm::GCNTTIImpl::collectFlatAddressOperands
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
Definition: AMDGPUTargetTransformInfo.cpp:1016
llvm::AMDGPU::SendMsg::Op
Op
Definition: SIDefines.h:302
llvm::GCNTTIImpl::getLoadStoreVecRegBitWidth
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
Definition: AMDGPUTargetTransformInfo.cpp:350
llvm::ExtractValueInst
This instruction extracts a struct member or array element value from an aggregate value.
Definition: Instructions.h:2318
llvm::GCNTTIImpl::getStoreVectorFactor
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Definition: AMDGPUTargetTransformInfo.cpp:340
llvm::BasicTTIImplBase< GCNTTIImpl >::getIntrinsicInstrCost
unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
Definition: BasicTTIImpl.h:1196
llvm::R600TTIImpl::getRegisterBitWidth
unsigned getRegisterBitWidth(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:1226
llvm::TargetLoweringBase::isOperationExpand
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
Definition: TargetLowering.h:1203
llvm::LoopBase::getHeader
BlockT * getHeader() const
Definition: LoopInfo.h:104
llvm::GCNTTIImpl::adjustInliningThreshold
unsigned adjustInliningThreshold(const CallBase *CB) const
Definition: AMDGPUTargetTransformInfo.cpp:1166
llvm::SDValue
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
Definition: SelectionDAGNodes.h:138
llvm::ISD::ADD
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:232
llvm::IntrinsicInst
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:44
llvm::GCNTTIImpl::rewriteIntrinsicWithAddressSpace
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
Definition: AMDGPUTargetTransformInfo.cpp:1033
llvm::R600TTIImpl::getVectorInstrCost
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:1297
Vector
Definition: README_P9.txt:497
llvm::R600TTIImpl::getMinVectorRegisterBitWidth
unsigned getMinVectorRegisterBitWidth() const
Definition: AMDGPUTargetTransformInfo.cpp:1230
llvm::InlineAsm::isOutput
@ isOutput
Definition: InlineAsm.h:93
llvm::FPOpFusion::Fast
@ Fast
Definition: TargetOptions.h:37
llvm::RecurKind::FAdd
@ FAdd
Sum of floats.
llvm::ISD::FSUB
@ FSUB
Definition: ISDOpcodes.h:372
llvm::IntrinsicCostAttributes::getID
Intrinsic::ID getID() const
Definition: TargetTransformInfo.h:145
llvm::ISD::SHL
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:613
llvm::ISD::FREM
@ FREM
Definition: ISDOpcodes.h:375
llvm::ISD::MUL
@ MUL
Definition: ISDOpcodes.h:234
llvm::BasicTTIImplBase< GCNTTIImpl >::getTypeBasedIntrinsicInstrCost
unsigned getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on argument types.
Definition: BasicTTIImpl.h:1367
llvm::MVT::f16
@ f16
Definition: MachineValueType.h:51
llvm::CallBase::getArgOperand
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1341
llvm::R600TTIImpl::R600TTIImpl
R600TTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:1213
llvm::TargetTransformInfo::UnrollingPreferences::Threshold
unsigned Threshold
The cost threshold for the unrolled loop.
Definition: TargetTransformInfo.h:428
llvm::Instruction::getParent
const BasicBlock * getParent() const
Definition: Instruction.h:94
llvm::ISD::SRL
@ SRL
Definition: ISDOpcodes.h:615
llvm::AMDGPUAS::PRIVATE_ADDRESS
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:366
llvm::AMDGPU::IsaInfo::getWavesPerEUForWorkGroup
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
Definition: AMDGPUBaseInfo.cpp:412
llvm::ArrayRef::size
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:163
llvm::max
Align max(MaybeAlign Lhs, Align Rhs)
Definition: Alignment.h:350
llvm::GCNTTIImpl::GCNTTIImpl
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
Definition: AMDGPUTargetTransformInfo.cpp:280
llvm::PHINode
Definition: Instructions.h:2572
Threshold
static cl::opt< unsigned > Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"), cl::init(100), cl::Hidden)
llvm::SmallVectorImpl
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: APFloat.h:43
llvm::TargetOptions::UnsafeFPMath
unsigned UnsafeFPMath
UnsafeFPMath - This flag is enabled when the -enable-unsafe-fp-math flag is specified on the command ...
Definition: TargetOptions.h:158
llvm::MemIntrinsicInfo
Information about a load/store intrinsic defined by the target.
Definition: TargetTransformInfo.h:67
llvm::CallBase
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
Definition: InstrTypes.h:1164
llvm::Type::getInt16Ty
static IntegerType * getInt16Ty(LLVMContext &C)
Definition: Type.cpp:196
llvm::Module::getDataLayout
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.cpp:397
llvm::DataLayout::getPointerSizeInBits
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits. FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:403
llvm::AMDGPU::HSAMD::Kernel::CodeProps::Key::NumVGPRs
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
Definition: AMDGPUMetadata.h:245
llvm::MVT::i16
@ i16
Definition: MachineValueType.h:42
llvm::LoopBase::isLoopExiting
bool isLoopExiting(const BlockT *BB) const
True if the terminator in the block can branch to another block that is outside of the current loop.
Definition: LoopInfo.h:225
llvm::BasicTTIImplBase< GCNTTIImpl >::getScalarizationOverhead
unsigned getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract)
Estimate the overhead of scalarizing an instruction.
Definition: BasicTTIImpl.h:580
llvm::CallInst
This class represents a function call, abstracting a target machine's calling convention.
Definition: Instructions.h:1450
llvm::ISD::FNEG
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:822
BB
Definition: README.txt:39
GEP
Hexagon Common GEP
Definition: HexagonCommonGEP.cpp:171
llvm::AMDGPU::HSAMD::Kernel::Key::Args
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
Definition: AMDGPUMetadata.h:379
llvm::GCNTTIImpl::useGPUDivergenceAnalysis
bool useGPUDivergenceAnalysis() const
Definition: AMDGPUTargetTransformInfo.cpp:922
llvm::AllocaInst
An instruction to allocate memory on the stack.
Definition: Instructions.h:61
llvm::User::getOperand
Value * getOperand(unsigned i) const
Definition: User.h:169
llvm::cl::desc
Definition: CommandLine.h:411
llvm::TargetLoweringBase::getValueType
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
Definition: TargetLowering.h:1386
ArgAllocaCutoff
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
llvm::IntrinsicCostAttributes::isTypeBasedOnly
bool isTypeBasedOnly() const
Definition: TargetTransformInfo.h:153
llvm::BranchInst
Conditional or Unconditional Branch instruction.
Definition: Instructions.h:3005
llvm::BasicTTIImplBase< GCNTTIImpl >::getMinMaxReductionCost
unsigned getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsPairwise, bool IsUnsigned, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
Definition: BasicTTIImpl.h:1940
llvm::MVT::v2i16
@ v2i16
Definition: MachineValueType.h:84
llvm::GCNTTIImpl::getVectorInstrCost
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index)
Definition: AMDGPUTargetTransformInfo.cpp:849
llvm::TargetOptions::AllowFPOpFusion
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fuse-fp-ops=xxx option.
Definition: TargetOptions.h:367
llvm::MVT::f32
@ f32
Definition: MachineValueType.h:52
UnrollRuntimeLocal
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
llvm::AMDGPUAS::CONSTANT_ADDRESS_32BIT
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:368
llvm::ExtractValueInst::getIndices
ArrayRef< unsigned > getIndices() const
Definition: Instructions.h:2383
llvm::R600TTIImpl::getNumberOfRegisters
unsigned getNumberOfRegisters(bool Vec) const
Definition: AMDGPUTargetTransformInfo.cpp:1222
llvm::Value
LLVM Value Representation.
Definition: Value.h:75
llvm::TargetTransformInfo::TCK_RecipThroughput
@ TCK_RecipThroughput
Reciprocal throughput.
Definition: TargetTransformInfo.h:211
llvm::IntrinsicCostAttributes::getArgs
const SmallVectorImpl< const Value * > & getArgs() const
Definition: TargetTransformInfo.h:150
AMDGPUTargetMachine.h
llvm::CallBase::args
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1322
llvm::AMDGPUAS::PARAM_I_ADDRESS
@ PARAM_I_ADDRESS
Address space for indirectly addressable parameter memory (VTX1).
Definition: AMDGPU.h:375
llvm::TargetLowering::AsmOperandInfoVector
std::vector< AsmOperandInfo > AsmOperandInfoVector
Definition: TargetLowering.h:4160
llvm::GCNSubtarget::hasUsableDivScaleConditionOutput
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:403
llvm::SmallPtrSetImpl::insert
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:364
llvm::Intrinsic::ID
unsigned ID
Definition: TargetTransformInfo.h:37
llvm::BasicTTIImplBase< GCNTTIImpl >::getVectorInstrCost
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index)
Definition: BasicTTIImpl.h:949
llvm::DataLayout::getTypeAllocSize
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:497
llvm::GCNTTIImpl::getRegisterBitWidth
unsigned getRegisterBitWidth(bool Vector) const
Definition: AMDGPUTargetTransformInfo.cpp:313
llvm::ISD::FDIV
@ FDIV
Definition: ISDOpcodes.h:374
llvm::GCNTTIImpl::getPeelingPreferences
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: AMDGPUTargetTransformInfo.cpp:1201