1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/IRBuilder.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
29#include <optional>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "AMDGPUtti"
34
35static cl::opt<unsigned> UnrollThresholdPrivate(
36 "amdgpu-unroll-threshold-private",
37 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
38 cl::init(2700), cl::Hidden);
39
40static cl::opt<unsigned> UnrollThresholdLocal(
41 "amdgpu-unroll-threshold-local",
42 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
43 cl::init(1000), cl::Hidden);
44
45static cl::opt<unsigned> UnrollThresholdIf(
46 "amdgpu-unroll-threshold-if",
47 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
48 cl::init(200), cl::Hidden);
49
50static cl::opt<bool> UnrollRuntimeLocal(
51 "amdgpu-unroll-runtime-local",
52 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
53 cl::init(true), cl::Hidden);
54
55static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
56 "amdgpu-unroll-max-block-to-analyze",
57 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
58 cl::init(32), cl::Hidden);
59
60static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
61 cl::Hidden, cl::init(4000),
62 cl::desc("Cost of alloca argument"));
63
64// If the amount of scratch memory to eliminate exceeds our ability to allocate
65// it into registers we gain nothing by aggressively inlining functions for that
66// heuristic.
67static cl::opt<unsigned>
68 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
69 cl::init(256),
70 cl::desc("Maximum alloca size to use for inline cost"));
71
72// Inliner constraint to achieve reasonable compilation time.
73static cl::opt<size_t> InlineMaxBB(
74 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
75 cl::desc("Maximum number of BBs allowed in a function after inlining"
76 " (compile time constraint)"));
77
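// Helper used by the unrolling heuristic below: returns true if Cond
// (transitively, up to a small depth) depends on a PHI node that belongs to
// loop L itself rather than to one of its subloops.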
78static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
79 unsigned Depth = 0) {
80 const Instruction *I = dyn_cast<Instruction>(Cond);
81 if (!I)
82 return false;
83
84 for (const Value *V : I->operand_values()) {
85 if (!L->contains(I))
86 continue;
87 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
88 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
89 return SubLoop->contains(PHI); }))
90 return true;
91 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
92 return true;
93 }
94 return false;
95}
96
97AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
98 : BaseT(TM, F.getParent()->getDataLayout()),
99 TargetTriple(TM->getTargetTriple()),
100 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
101 TLI(ST->getTargetLowering()) {}
102
103void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
104 TTI::UnrollingPreferences &UP,
105 OptimizationRemarkEmitter *ORE) {
106 const Function &F = *L->getHeader()->getParent();
107 UP.Threshold =
108 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
109 UP.MaxCount = std::numeric_limits<unsigned>::max();
110 UP.Partial = true;
111
112 // A conditional branch in a loop back edge needs 3 additional exec
113 // manipulations on average.
114 UP.BEInsns += 3;
115
116 // We want to run unroll even for the loops which have been vectorized.
117 UP.UnrollVectorizedLoop = true;
118
119 // TODO: Do we want runtime unrolling?
120
121 // Maximum alloca size that can fit in registers. Reserve 16 registers.
122 const unsigned MaxAlloca = (256 - 16) * 4;
123 unsigned ThresholdPrivate = UnrollThresholdPrivate;
124 unsigned ThresholdLocal = UnrollThresholdLocal;
125
126 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
127 // provided threshold value as the default for Threshold
128 if (MDNode *LoopUnrollThreshold =
129 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
130 if (LoopUnrollThreshold->getNumOperands() == 2) {
131 ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
132 LoopUnrollThreshold->getOperand(1));
133 if (MetaThresholdValue) {
134 // We will also use the supplied value for PartialThreshold for now.
135 // We may introduce additional metadata if it becomes necessary in the
136 // future.
137 UP.Threshold = MetaThresholdValue->getSExtValue();
138 UP.PartialThreshold = UP.Threshold;
139 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
140 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
141 }
142 }
143 }
144
145 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
146 for (const BasicBlock *BB : L->getBlocks()) {
147 const DataLayout &DL = BB->getModule()->getDataLayout();
148 unsigned LocalGEPsSeen = 0;
149
150 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
151 return SubLoop->contains(BB); }))
152 continue; // Block belongs to an inner loop.
153
154 for (const Instruction &I : *BB) {
155 // Unroll a loop which contains an "if" statement whose condition is
156 // defined by a PHI belonging to the loop. This may help to eliminate the
157 // if-region and potentially even the PHI itself, saving on both divergence
158 // and the registers used for the PHI.
159 // Add a small bonus for each of such "if" statements.
160 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
161 if (UP.Threshold < MaxBoost && Br->isConditional()) {
162 BasicBlock *Succ0 = Br->getSuccessor(0);
163 BasicBlock *Succ1 = Br->getSuccessor(1);
164 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
165 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
166 continue;
167 if (dependsOnLocalPhi(L, Br->getCondition())) {
168 UP.Threshold += UnrollThresholdIf;
169 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
170 << " for loop:\n"
171 << *L << " due to " << *Br << '\n');
172 if (UP.Threshold >= MaxBoost)
173 return;
174 }
175 }
176 continue;
177 }
178
179 const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
180 if (!GEP)
181 continue;
182
183 unsigned AS = GEP->getAddressSpace();
184 unsigned Threshold = 0;
185 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
186 Threshold = ThresholdPrivate;
187 else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
188 Threshold = ThresholdLocal;
189 else
190 continue;
191
192 if (UP.Threshold >= Threshold)
193 continue;
194
195 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
196 const Value *Ptr = GEP->getPointerOperand();
197 const AllocaInst *Alloca =
198 dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
199 if (!Alloca || !Alloca->isStaticAlloca())
200 continue;
201 Type *Ty = Alloca->getAllocatedType();
202 unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
203 if (AllocaSize > MaxAlloca)
204 continue;
205 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
206 AS == AMDGPUAS::REGION_ADDRESS) {
207 LocalGEPsSeen++;
208 // Inhibit unrolling for local memory if we have seen addressing that is
209 // not based on a variable; most likely we will be unable to combine it.
210 // Do not unroll too-deeply-nested inner loops for local memory, to give
211 // an outer loop a chance to be unrolled for a more important reason.
212 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
213 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
214 !isa<Argument>(GEP->getPointerOperand())))
215 continue;
216 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
217 << *L << " due to LDS use.\n");
218 UP.Runtime = UnrollRuntimeLocal;
219 }
220
221 // Check if GEP depends on a value defined by this loop itself.
222 bool HasLoopDef = false;
223 for (const Value *Op : GEP->operands()) {
224 const Instruction *Inst = dyn_cast<Instruction>(Op);
225 if (!Inst || L->isLoopInvariant(Op))
226 continue;
227
228 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
229 return SubLoop->contains(Inst); }))
230 continue;
231 HasLoopDef = true;
232 break;
233 }
234 if (!HasLoopDef)
235 continue;
236
237 // We want to do whatever we can to limit the number of alloca
238 // instructions that make it through to the code generator. allocas
239 // require us to use indirect addressing, which is slow and prone to
240 // compiler bugs. If this loop does an address calculation on an
241 // alloca ptr, then we want to use a higher than normal loop unroll
242 // threshold. This will give SROA a better chance to eliminate these
243 // allocas.
244 //
245 // We also want to have more unrolling for local memory to let ds
246 // instructions with different offsets combine.
247 //
248 // Don't use the maximum allowed value here as it will make some
249 // programs way too big.
250 UP.Threshold = Threshold;
251 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
252 << " for loop:\n"
253 << *L << " due to " << *GEP << '\n');
254 if (UP.Threshold >= MaxBoost)
255 return;
256 }
257
258 // If we got a GEP in a small BB from an inner loop then increase the max
259 // trip count to analyze, for a better cost estimation in the unroller.
260 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
261 UP.MaxIterationsCountToAnalyze = 32;
262 }
263}
264
265void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
266 TTI::PeelingPreferences &PP) {
267 BaseT::getPeelingPreferences(L, SE, PP);
268}
269
270int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
271 return 1024;
272}
273
274const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
275 // Codegen control options which don't matter.
276 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
277 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureFlatForGlobal,
278 AMDGPU::FeaturePromoteAlloca, AMDGPU::FeatureUnalignedScratchAccess,
279 AMDGPU::FeatureUnalignedAccessMode,
280
281 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
282
283 // Property of the kernel/environment which can't actually differ.
284 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
285 AMDGPU::FeatureTrapHandler,
286
287 // The default assumption needs to be that ECC is enabled, but no directly
288 // exposed operations depend on it, so it can be safely inlined.
289 AMDGPU::FeatureSRAMECC,
290
291 // Perf-tuning features
292 AMDGPU::FeatureFastFMAF32, AMDGPU::HalfRate64Ops};
293
294GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
295 : BaseT(TM, F.getParent()->getDataLayout()),
296 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
297 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
298 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
300 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
301 HasFP64FP16Denormals =
302 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
303}
304
305bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
306 return !F || !ST->isSingleLaneExecution(*F);
307}
308
309unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
310 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
311 // registers. See getRegisterClassForType for the implementation.
312 // In this case vector registers are not vector in terms of
313 // VGPRs, but those which can hold multiple values.
314
315 // This is really the number of registers to fill when vectorizing /
316 // interleaving loops, so we lie to avoid trying to use all registers.
317 return 4;
318}
319
320TypeSize
321GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
322 switch (K) {
323 case TargetTransformInfo::RGK_Scalar:
324 return TypeSize::getFixed(32);
325 case TargetTransformInfo::RGK_FixedWidthVector:
326 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
327 case TargetTransformInfo::RGK_ScalableVector:
328 return TypeSize::getScalable(0);
329 }
330 llvm_unreachable("Unsupported register kind");
331}
332
333unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
334 return 32;
335}
336
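// Cap the vectorization factor: loads and stores may use up to 128 bits per
// lane, while other operations only benefit from a 2-wide factor when packed
// 16-bit (or packed FP32) instructions are available.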
337unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
338 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
339 return 32 * 4 / ElemWidth;
340 return (ElemWidth == 16 && ST->has16BitInsts()) ? 2
341 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
342 : 1;
343}
344
345unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
346 unsigned ChainSizeInBytes,
347 VectorType *VecTy) const {
348 unsigned VecRegBitWidth = VF * LoadSize;
349 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
350 // TODO: Support element-size less than 32bit?
351 return 128 / LoadSize;
352
353 return VF;
354}
355
356unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
357 unsigned ChainSizeInBytes,
358 VectorType *VecTy) const {
359 unsigned VecRegBitWidth = VF * StoreSize;
360 if (VecRegBitWidth > 128)
361 return 128 / StoreSize;
362
363 return VF;
364}
365
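// Widest total access size (in bits) the load/store vectorizer should form
// for a given address space.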
366unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
367 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
368 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
369 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
370 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
371 AddrSpace == AMDGPUAS::BUFFER_RESOURCE) {
372 return 512;
373 }
374
375 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
376 return 8 * ST->getMaxPrivateElementSize();
377
378 // Common to flat, global, local and region. Assume for unknown addrspace.
379 return 128;
380}
381
382bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
383 Align Alignment,
384 unsigned AddrSpace) const {
385 // We allow vectorization of flat stores, even though we may need to decompose
386 // them later if they may access private memory. We don't have enough context
387 // here, and legalization can handle it.
388 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
389 return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
390 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
391 }
392 return true;
393}
394
395bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
396 Align Alignment,
397 unsigned AddrSpace) const {
398 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
399}
400
401bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
402 Align Alignment,
403 unsigned AddrSpace) const {
404 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
405}
406
407int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
408 return 1024;
409}
410
411// FIXME: Really we would like to issue multiple 128-bit loads and stores per
412// iteration. Should we report a larger size and let it legalize?
413//
414// FIXME: Should we use narrower types for local/region, or account for when
415// unaligned access is legal?
416//
417// FIXME: This could use fine tuning and microbenchmarks.
418Type *GCNTTIImpl::getMemcpyLoopLoweringType(
419 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
420 unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
421 std::optional<uint32_t> AtomicElementSize) const {
422
423 if (AtomicElementSize)
424 return Type::getIntNTy(Context, *AtomicElementSize * 8);
425
426 unsigned MinAlign = std::min(SrcAlign, DestAlign);
427
428 // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
429 // hardware into byte accesses. If you assume all alignments are equally
430 // probable, it's more efficient on average to use short accesses for this
431 // case.
432 if (MinAlign == 2)
433 return Type::getInt16Ty(Context);
434
435 // Not all subtargets have 128-bit DS instructions, and we currently don't
436 // form them by default.
437 if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
438 SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
439 DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
440 DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
441 return FixedVectorType::get(Type::getInt32Ty(Context), 2);
442 }
443
444 // Global memory works best with 16-byte accesses. Private memory will also
445 // hit this, although they'll be decomposed.
446 return FixedVectorType::get(Type::getInt32Ty(Context), 4);
447}
448
449void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
450 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
451 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
452 unsigned SrcAlign, unsigned DestAlign,
453 std::optional<uint32_t> AtomicCpySize) const {
454 assert(RemainingBytes < 16);
455
456 if (AtomicCpySize)
457 BaseT::getMemcpyLoopResidualLoweringType(
458 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
459 DestAlign, AtomicCpySize);
460
461 unsigned MinAlign = std::min(SrcAlign, DestAlign);
462
463 if (MinAlign != 2) {
464 Type *I64Ty = Type::getInt64Ty(Context);
465 while (RemainingBytes >= 8) {
466 OpsOut.push_back(I64Ty);
467 RemainingBytes -= 8;
468 }
469
470 Type *I32Ty = Type::getInt32Ty(Context);
471 while (RemainingBytes >= 4) {
472 OpsOut.push_back(I32Ty);
473 RemainingBytes -= 4;
474 }
475 }
476
477 Type *I16Ty = Type::getInt16Ty(Context);
478 while (RemainingBytes >= 2) {
479 OpsOut.push_back(I16Ty);
480 RemainingBytes -= 2;
481 }
482
483 Type *I8Ty = Type::getInt8Ty(Context);
484 while (RemainingBytes) {
485 OpsOut.push_back(I8Ty);
486 --RemainingBytes;
487 }
488}
489
490unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
491 // Disable unrolling if the loop is not vectorized.
492 // TODO: Enable this again.
493 if (VF.isScalar())
494 return 1;
495
496 return 8;
497}
498
499bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
500 MemIntrinsicInfo &Info) const {
501 switch (Inst->getIntrinsicID()) {
502 case Intrinsic::amdgcn_ds_ordered_add:
503 case Intrinsic::amdgcn_ds_ordered_swap:
504 case Intrinsic::amdgcn_ds_fadd:
505 case Intrinsic::amdgcn_ds_fmin:
506 case Intrinsic::amdgcn_ds_fmax: {
507 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
508 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
509 if (!Ordering || !Volatile)
510 return false; // Invalid.
511
512 unsigned OrderingVal = Ordering->getZExtValue();
513 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
514 return false;
515
516 Info.PtrVal = Inst->getArgOperand(0);
517 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
518 Info.ReadMem = true;
519 Info.WriteMem = true;
520 Info.IsVolatile = !Volatile->isZero();
521 return true;
522 }
523 default:
524 return false;
525 }
526}
527
528InstructionCost GCNTTIImpl::getArithmeticInstrCost(
529 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
530 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
531 ArrayRef<const Value *> Args,
532 const Instruction *CxtI) {
533
534 // Legalize the type.
535 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
536 int ISD = TLI->InstructionOpcodeToISD(Opcode);
537
538 // Because we don't have any legal vector operations, only legal types, we
539 // need to account for split vectors.
540 unsigned NElts = LT.second.isVector() ?
541 LT.second.getVectorNumElements() : 1;
542
543 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
544
545 switch (ISD) {
546 case ISD::SHL:
547 case ISD::SRL:
548 case ISD::SRA:
549 if (SLT == MVT::i64)
550 return get64BitInstrCost(CostKind) * LT.first * NElts;
551
552 if (ST->has16BitInsts() && SLT == MVT::i16)
553 NElts = (NElts + 1) / 2;
554
555 // i32
556 return getFullRateInstrCost() * LT.first * NElts;
557 case ISD::ADD:
558 case ISD::SUB:
559 case ISD::AND:
560 case ISD::OR:
561 case ISD::XOR:
562 if (SLT == MVT::i64) {
563 // and, or and xor are typically split into 2 VALU instructions.
564 return 2 * getFullRateInstrCost() * LT.first * NElts;
565 }
566
567 if (ST->has16BitInsts() && SLT == MVT::i16)
568 NElts = (NElts + 1) / 2;
569
570 return LT.first * NElts * getFullRateInstrCost();
571 case ISD::MUL: {
572 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
573 if (SLT == MVT::i64) {
574 const int FullRateCost = getFullRateInstrCost();
575 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
576 }
577
578 if (ST->has16BitInsts() && SLT == MVT::i16)
579 NElts = (NElts + 1) / 2;
580
581 // i32
582 return QuarterRateCost * NElts * LT.first;
583 }
584 case ISD::FMUL:
585 // Check for a possible fusion of {fadd|fsub}(a, fmul(b,c)) and return zero
586 // cost for the fmul(b,c), assuming the fadd|fsub will be charged the cost
587 // of the whole fused operation.
588 if (CxtI && CxtI->hasOneUse())
589 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
590 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
591 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
592 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
593 return TargetTransformInfo::TCC_Free;
594 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
595 return TargetTransformInfo::TCC_Free;
596
597 // Estimate all types may be fused with contract/unsafe flags
598 const TargetOptions &Options = TLI->getTargetMachine().Options;
599 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
600 Options.UnsafeFPMath ||
601 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
602 return TargetTransformInfo::TCC_Free;
603 }
604 }
605 [[fallthrough]];
606 case ISD::FADD:
607 case ISD::FSUB:
608 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
609 NElts = (NElts + 1) / 2;
610 if (SLT == MVT::f64)
611 return LT.first * NElts * get64BitInstrCost(CostKind);
612
613 if (ST->has16BitInsts() && SLT == MVT::f16)
614 NElts = (NElts + 1) / 2;
615
616 if (SLT == MVT::f32 || SLT == MVT::f16)
617 return LT.first * NElts * getFullRateInstrCost();
618 break;
619 case ISD::FDIV:
620 case ISD::FREM:
621 // FIXME: frem should be handled separately. The fdiv in it is most of it,
622 // but the current lowering is also not entirely correct.
623 if (SLT == MVT::f64) {
624 int Cost = 7 * get64BitInstrCost(CostKind) +
625 getQuarterRateInstrCost(CostKind) +
626 3 * getHalfRateInstrCost(CostKind);
627 // Add cost of workaround.
628 if (!ST->hasUsableDivScaleConditionOutput())
629 Cost += 3 * getFullRateInstrCost();
630
631 return LT.first * Cost * NElts;
632 }
633
634 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
635 // TODO: This is more complicated, unsafe flags etc.
636 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
637 (SLT == MVT::f16 && ST->has16BitInsts())) {
638 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
639 }
640 }
641
642 if (SLT == MVT::f16 && ST->has16BitInsts()) {
643 // 2 x v_cvt_f32_f16
644 // f32 rcp
645 // f32 fmul
646 // v_cvt_f16_f32
647 // f16 div_fixup
648 int Cost =
649 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
650 return LT.first * Cost * NElts;
651 }
652
653 if (SLT == MVT::f32 || SLT == MVT::f16) {
654 // 4 more v_cvt_* insts without f16 insts support
655 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
656 1 * getQuarterRateInstrCost(CostKind);
657
658 if (!HasFP32Denormals) {
659 // FP mode switches.
660 Cost += 2 * getFullRateInstrCost();
661 }
662
663 return LT.first * NElts * Cost;
664 }
665 break;
666 case ISD::FNEG:
667 // Use the backend's estimation. If fneg is not free, each element will
668 // cost one additional instruction.
669 return TLI->isFNegFree(SLT) ? 0 : NElts;
670 default:
671 break;
672 }
673
674 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
675 Args, CxtI);
676}
677
678// Return true if there's a potential benefit from using v2f16/v2i16
679// instructions for an intrinsic, even if it requires nontrivial legalization.
680static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
681 switch (ID) {
682 case Intrinsic::fma: // TODO: fmuladd
683 // There's a small benefit to using vector ops in the legalized code.
684 case Intrinsic::round:
685 case Intrinsic::uadd_sat:
686 case Intrinsic::usub_sat:
687 case Intrinsic::sadd_sat:
688 case Intrinsic::ssub_sat:
689 return true;
690 default:
691 return false;
692 }
693}
694
695InstructionCost
696GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
697 TTI::TargetCostKind CostKind) {
698 if (ICA.getID() == Intrinsic::fabs)
699 return 0;
700
701 if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
702 return BaseT::getIntrinsicInstrCost(ICA, CostKind);
703
704 Type *RetTy = ICA.getReturnType();
705
706 // Legalize the type.
707 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
708
709 unsigned NElts = LT.second.isVector() ?
710 LT.second.getVectorNumElements() : 1;
711
712 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
713
714 if (SLT == MVT::f64)
715 return LT.first * NElts * get64BitInstrCost(CostKind);
716
717 if ((ST->has16BitInsts() && SLT == MVT::f16) ||
718 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
719 NElts = (NElts + 1) / 2;
720
721 // TODO: Get more refined intrinsic costs?
722 unsigned InstRate = getQuarterRateInstrCost(CostKind);
723
724 switch (ICA.getID()) {
725 case Intrinsic::fma:
726 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
727 : getQuarterRateInstrCost(CostKind);
728 break;
729 case Intrinsic::uadd_sat:
730 case Intrinsic::usub_sat:
731 case Intrinsic::sadd_sat:
732 case Intrinsic::ssub_sat:
733 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
734 if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
735 NElts = 1;
736 break;
737 }
738
739 return LT.first * NElts * InstRate;
740}
741
742InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
743 TTI::TargetCostKind CostKind,
744 const Instruction *I) {
745 assert((I == nullptr || I->getOpcode() == Opcode) &&
746 "Opcode should reflect passed instruction.");
747 const bool SCost =
748 (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
749 const int CBrCost = SCost ? 5 : 7;
750 switch (Opcode) {
751 case Instruction::Br: {
752 // Branch instruction takes about 4 slots on gfx900.
753 auto BI = dyn_cast_or_null<BranchInst>(I);
754 if (BI && BI->isUnconditional())
755 return SCost ? 1 : 4;
756 // Assume a conditional branch takes an additional 3 exec-manipulation
757 // instructions on average.
758 return CBrCost;
759 }
760 case Instruction::Switch: {
761 auto SI = dyn_cast_or_null<SwitchInst>(I);
762 // Each case (including the default) takes 1 cmp + 1 cbr instruction on
763 // average.
764 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
765 }
766 case Instruction::Ret:
767 return SCost ? 1 : 10;
768 }
769 return BaseT::getCFInstrCost(Opcode, CostKind, I);
770}
771
772InstructionCost
773GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
774 std::optional<FastMathFlags> FMF,
775 TTI::TargetCostKind CostKind) {
776 if (TTI::requiresOrderedReduction(FMF))
777 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
778
779 EVT OrigTy = TLI->getValueType(DL, Ty);
780
781 // Computes cost on targets that have packed math instructions (which
782 // support 16-bit types only).
783 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
784 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
785
786 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
787 return LT.first * getFullRateInstrCost();
788}
789
790InstructionCost
791GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
792 FastMathFlags FMF,
793 TTI::TargetCostKind CostKind) {
794 EVT OrigTy = TLI->getValueType(DL, Ty);
795
796 // Computes cost on targets that have packed math instructions (which
797 // support 16-bit types only).
798 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
799 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
800
801 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
802 return LT.first * getHalfRateInstrCost(CostKind);
803}
804
805InstructionCost GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
806 TTI::TargetCostKind CostKind,
807 unsigned Index, Value *Op0,
808 Value *Op1) {
809 switch (Opcode) {
810 case Instruction::ExtractElement:
811 case Instruction::InsertElement: {
812 unsigned EltSize
813 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
814 if (EltSize < 32) {
815 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
816 return 0;
817 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0,
818 Op1);
819 }
820
821 // Extracts are just reads of a subregister, so are free. Inserts are
822 // considered free because we don't want to have any cost for scalarizing
823 // operations, and we don't have to copy into a different register class.
824
825 // Dynamic indexing isn't free and is best avoided.
826 return Index == ~0u ? 2 : 0;
827 }
828 default:
829 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1);
830 }
831}
832
833/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
834/// this is analyzing the collective result of all output registers. Otherwise,
835/// this is only querying a specific result index if this returns multiple
836/// registers in a struct.
837bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
838 const CallInst *CI, ArrayRef<unsigned> Indices) const {
839 // TODO: Handle complex extract indices
840 if (Indices.size() > 1)
841 return true;
842
843 const DataLayout &DL = CI->getModule()->getDataLayout();
844 const SIRegisterInfo *TRI = ST->getRegisterInfo();
845 TargetLowering::AsmOperandInfoVector TargetConstraints =
846 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
847
848 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
849
850 int OutputIdx = 0;
851 for (auto &TC : TargetConstraints) {
852 if (TC.Type != InlineAsm::isOutput)
853 continue;
854
855 // Skip outputs we don't care about.
856 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
857 continue;
858
859 TLI->ComputeConstraintToUse(TC, SDValue());
860
861 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
862 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
863
864 // For AGPR constraints null is returned on subtargets without AGPRs, so
865 // assume divergent for null.
866 if (!RC || !TRI->isSGPRClass(RC))
867 return true;
868 }
869
870 return false;
871}
872
873bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
874 const IntrinsicInst *ReadReg) const {
875 Metadata *MD =
876 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
877 StringRef RegName =
878 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
879
880 // Special case registers that look like VCC.
881 MVT VT = MVT::getVT(ReadReg->getType());
882 if (VT == MVT::i1)
883 return true;
884
885 // Special case scalar registers that start with 'v'.
886 if (RegName.startswith("vcc") || RegName.empty())
887 return false;
888
889 // VGPR or AGPR is divergent. There aren't any specially named vector
890 // registers.
891 return RegName[0] == 'v' || RegName[0] == 'a';
892}
893
894/// \returns true if the result of the value could potentially be
895/// different across workitems in a wavefront.
896bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
897 if (const Argument *A = dyn_cast<Argument>(V))
898 return !AMDGPU::isArgPassedInSGPR(A);
899
900 // Loads from the private and flat address spaces are divergent, because
901 // threads can execute the load instruction with the same inputs and get
902 // different results.
903 //
904 // All other loads are not divergent, because if threads issue loads with the
905 // same arguments, they will always get the same result.
906 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
907 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
908 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
909
910 // Atomics are divergent because they are executed sequentially: when an
911 // atomic operation refers to the same address in each thread, then each
912 // thread after the first sees the value written by the previous thread as
913 // original value.
914 if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
915 return true;
916
917 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
918 if (Intrinsic->getIntrinsicID() == Intrinsic::read_register)
919 return isReadRegisterSourceOfDivergence(Intrinsic);
920
921 return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
922 }
923
924 // Assume all function calls are a source of divergence.
925 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
926 if (CI->isInlineAsm())
927 return isInlineAsmSourceOfDivergence(CI);
928 return true;
929 }
930
931 // Assume all function calls are a source of divergence.
932 if (isa<InvokeInst>(V))
933 return true;
934
935 return false;
936}
937
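// Returns true for values that are known to be wave-uniform even though they
// are not trivially so: always-uniform intrinsics, wave-aligned arithmetic on
// workitem-id-x, results of amdgcn_if/else, and SGPR-only inline-asm outputs.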
938bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
939 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
940 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
941
942 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
943 if (CI->isInlineAsm())
944 return !isInlineAsmSourceOfDivergence(CI);
945 return false;
946 }
947
948 // In most cases TID / wavefrontsize is uniform.
949 //
950 // However, if a kernel has uneven dimensions we can have a value of
951 // workitem-id-x divided by the wavefrontsize be non-uniform. For example,
952 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
953 // packed into the same wave, which gives 1 and 0 respectively after the
954 // division by 64.
955 //
956 // FIXME: limit it to 1D kernels only, although it should be possible to
957 // perform this optimization if the size of the X dimension is a power of
958 // 2; we just do not currently have the infrastructure to query it.
959 using namespace llvm::PatternMatch;
960 uint64_t C;
961 if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
962 m_ConstantInt(C))) ||
963 match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
964 m_ConstantInt(C)))) {
965 const Function *F = cast<Instruction>(V)->getFunction();
966 return C >= ST->getWavefrontSizeLog2() &&
967 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
968 }
969
970 Value *Mask;
971 if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
972 m_Value(Mask)))) {
973 const Function *F = cast<Instruction>(V)->getFunction();
974 const DataLayout &DL = F->getParent()->getDataLayout();
975 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
976 ST->getWavefrontSizeLog2() &&
977 ST->getMaxWorkitemID(*F, 1) == 0 && ST->getMaxWorkitemID(*F, 2) == 0;
978 }
979
980 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
981 if (!ExtValue)
982 return false;
983
984 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
985 if (!CI)
986 return false;
987
988 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
989 switch (Intrinsic->getIntrinsicID()) {
990 default:
991 return false;
992 case Intrinsic::amdgcn_if:
993 case Intrinsic::amdgcn_else: {
994 ArrayRef<unsigned> Indices = ExtValue->getIndices();
995 return Indices.size() == 1 && Indices[0] == 1;
996 }
997 }
998 }
999
1000 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1001 // divergent for the overall struct return. We need to override it in the
1002 // case we're extracting an SGPR component here.
1003 if (CI->isInlineAsm())
1004 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1005
1006 return false;
1007}
1008
1009bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1010 Intrinsic::ID IID) const {
1011 switch (IID) {
1012 case Intrinsic::amdgcn_ds_fadd:
1013 case Intrinsic::amdgcn_ds_fmin:
1014 case Intrinsic::amdgcn_ds_fmax:
1015 case Intrinsic::amdgcn_is_shared:
1016 case Intrinsic::amdgcn_is_private:
1017 case Intrinsic::amdgcn_flat_atomic_fadd:
1018 case Intrinsic::amdgcn_flat_atomic_fmax:
1019 case Intrinsic::amdgcn_flat_atomic_fmin:
1020 OpIndexes.push_back(0);
1021 return true;
1022 default:
1023 return false;
1024 }
1025}
1026
1027Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1028 Value *OldV,
1029 Value *NewV) const {
1030 auto IntrID = II->getIntrinsicID();
1031 switch (IntrID) {
1032 case Intrinsic::amdgcn_ds_fadd:
1033 case Intrinsic::amdgcn_ds_fmin:
1034 case Intrinsic::amdgcn_ds_fmax: {
1035 const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
1036 if (!IsVolatile->isZero())
1037 return nullptr;
1038 Module *M = II->getParent()->getParent()->getParent();
1039 Type *DestTy = II->getType();
1040 Type *SrcTy = NewV->getType();
1041 Function *NewDecl =
1042 Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
1043 II->setArgOperand(0, NewV);
1044 II->setCalledFunction(NewDecl);
1045 return II;
1046 }
1047 case Intrinsic::amdgcn_is_shared:
1048 case Intrinsic::amdgcn_is_private: {
1049 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1050 AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1051 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1052 LLVMContext &Ctx = NewV->getType()->getContext();
1053 ConstantInt *NewVal = (TrueAS == NewAS) ?
1054 ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1055 return NewVal;
1056 }
1057 case Intrinsic::ptrmask: {
1058 unsigned OldAS = OldV->getType()->getPointerAddressSpace();
1059 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1060 Value *MaskOp = II->getArgOperand(1);
1061 Type *MaskTy = MaskOp->getType();
1062
1063 bool DoTruncate = false;
1064
1065 const GCNTargetMachine &TM =
1066 static_cast<const GCNTargetMachine &>(getTLI()->getTargetMachine());
1067 if (!TM.isNoopAddrSpaceCast(OldAS, NewAS)) {
1068 // All valid 64-bit to 32-bit casts work by chopping off the high
1069 // bits. Any masking only clearing the low bits will also apply in the new
1070 // address space.
1071 if (DL.getPointerSizeInBits(OldAS) != 64 ||
1072 DL.getPointerSizeInBits(NewAS) != 32)
1073 return nullptr;
1074
1075 // TODO: Do we need to thread more context in here?
1076 KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
1077 if (Known.countMinLeadingOnes() < 32)
1078 return nullptr;
1079
1080 DoTruncate = true;
1081 }
1082
1083 IRBuilder<> B(II);
1084 if (DoTruncate) {
1085 MaskTy = B.getInt32Ty();
1086 MaskOp = B.CreateTrunc(MaskOp, MaskTy);
1087 }
1088
1089 return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
1090 {NewV, MaskOp});
1091 }
1092 case Intrinsic::amdgcn_flat_atomic_fadd:
1093 case Intrinsic::amdgcn_flat_atomic_fmax:
1094 case Intrinsic::amdgcn_flat_atomic_fmin: {
1095 Type *DestTy = II->getType();
1096 Type *SrcTy = NewV->getType();
1097 unsigned NewAS = SrcTy->getPointerAddressSpace();
1098 if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
1099 return nullptr;
1100 Module *M = II->getModule();
1101 Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
1102 {DestTy, SrcTy, DestTy});
1103 II->setArgOperand(0, NewV);
1104 II->setCalledFunction(NewDecl);
1105 return II;
1106 }
1107 default:
1108 return nullptr;
1109 }
1110}
1111
1112InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1113 VectorType *VT, ArrayRef<int> Mask,
1114 TTI::TargetCostKind CostKind,
1115 int Index, VectorType *SubTp,
1116 ArrayRef<const Value *> Args) {
1117 Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
1118
1119 if (ST->hasVOP3PInsts()) {
1120 if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
1121 DL.getTypeSizeInBits(VT->getElementType()) == 16) {
1122 // With op_sel, VOP3P instructions can freely access the low half or high
1123 // half of a register, so any swizzle is free.
1124
1125 switch (Kind) {
1126 case TTI::SK_Broadcast:
1127 case TTI::SK_Reverse:
1128 case TTI::SK_PermuteSingleSrc:
1129 return 0;
1130 default:
1131 break;
1132 }
1133 }
1134 }
1135
1136 return BaseT::getShuffleCost(Kind, VT, Mask, CostKind, Index, SubTp);
1137}
1138
1139bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1140 const Function *Callee) const {
1141 const TargetMachine &TM = getTLI()->getTargetMachine();
1142 const GCNSubtarget *CallerST
1143 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1144 const GCNSubtarget *CalleeST
1145 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1146
1147 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1148 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1149
1150 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1151 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1152 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1153 return false;
1154
1155 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1156 // no way to support merge for backend defined attributes.
1157 SIModeRegisterDefaults CallerMode(*Caller);
1158 SIModeRegisterDefaults CalleeMode(*Callee);
1159 if (!CallerMode.isInlineCompatible(CalleeMode))
1160 return false;
1161
1162 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1163 Callee->hasFnAttribute(Attribute::InlineHint))
1164 return true;
1165
1166 // Hack to make compile times reasonable.
1167 if (InlineMaxBB) {
1168 // Single BB does not increase total BB amount.
1169 if (Callee->size() == 1)
1170 return true;
1171 size_t BBSize = Caller->size() + Callee->size() - 1;
1172 return BBSize <= InlineMaxBB;
1173 }
1174
1175 return true;
1176}
1177
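// Estimate how much the inline threshold should be raised for a call whose
// arguments would not fit into the available SGPRs/VGPRs and would therefore
// be passed through the stack if the call is not inlined.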
1178static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1179 const SITargetLowering *TLI,
1180 const GCNTTIImpl *TTIImpl) {
1181 const int NrOfSGPRUntilSpill = 26;
1182 const int NrOfVGPRUntilSpill = 32;
1183
1184 const DataLayout &DL = TTIImpl->getDataLayout();
1185
1186 unsigned adjustThreshold = 0;
1187 int SGPRsInUse = 0;
1188 int VGPRsInUse = 0;
1189 for (const Use &A : CB->args()) {
1190 SmallVector<EVT, 4> ValueVTs;
1191 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1192 for (auto ArgVT : ValueVTs) {
1193 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1194 CB->getContext(), CB->getCallingConv(), ArgVT);
1195 if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
1196 SGPRsInUse += CCRegNum;
1197 else
1198 VGPRsInUse += CCRegNum;
1199 }
1200 }
1201
1202 // The cost of passing function arguments through the stack:
1203 // 1 instruction to put a function argument on the stack in the caller.
1204 // 1 instruction to take a function argument from the stack in callee.
1205 // 1 instruction to explicitly take care of data dependencies in the callee
1206 // function.
1207 InstructionCost ArgStackCost(1);
1208 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1209 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1210 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1211 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1212 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1213 AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1214
1215 // The penalty cost is computed relative to the cost of instructions and does
1216 // not model any storage costs.
1217 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1218 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1219 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1220 *ArgStackCost.getValue() * InlineConstants::getInstrCost();
1221 return adjustThreshold;
1222}
1223
1224static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1225 const DataLayout &DL) {
1226 // If we have a pointer to a private array passed into a function
1227 // it will not be optimized out, leaving scratch usage.
1228 // This function calculates the total size in bytes of the memory that would
1229 // end in scratch if the call was not inlined.
1230 unsigned AllocaSize = 0;
1231 SmallPtrSet<const AllocaInst *, 8> AIVisited;
1232 for (Value *PtrArg : CB->args()) {
1233 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1234 if (!Ty)
1235 continue;
1236
1237 unsigned AddrSpace = Ty->getAddressSpace();
1238 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1239 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1240 continue;
1241
1242 const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1243 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1244 continue;
1245
1246 AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
1247 }
1248 return AllocaSize;
1249}
1250
1251unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1252 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1253
1254 // Private objects passed as arguments may end up as scratch usage if the
1255 // call is not inlined. Increase the inline threshold to promote inlining.
1256 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1257 if (AllocaSize > 0)
1258 Threshold += ArgAllocaCost;
1259 return Threshold;
1260}
1261
1262unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1263 const AllocaInst *AI) const {
1264
1265 // Below the cutoff, assume that the private memory objects would be
1266 // optimized away.
1267 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1268 if (AllocaSize <= ArgAllocaCutoff)
1269 return 0;
1270
1271 // Above the cutoff, we give a cost to each private memory object
1272 // depending on its size. If the array can be optimized by SROA this cost is not
1273 // added to the total-cost in the inliner cost analysis.
1274 //
1275 // We choose the total cost of the alloca such that their sum cancels the
1276 // bonus given in the threshold (ArgAllocaCost).
1277 //
1278 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1279 //
1280 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1281 // the single-bb bonus and the vector-bonus.
1282 //
1283 // We compensate the first two multipliers, by repeating logic from the
1284 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1285 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1286 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1287
1288 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1289 return BB.getTerminator()->getNumSuccessors() > 1;
1290 });
1291 if (SingleBB) {
1292 Threshold += Threshold / 2;
1293 }
1294
1295 auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
1296
1297 // Attribute the bonus proportionally to the alloca size
1298 unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
1299
1300 return AllocaThresholdBonus;
1301}
1302
1303void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1304 TTI::UnrollingPreferences &UP,
1305 OptimizationRemarkEmitter *ORE) {
1306 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1307}
1308
1309void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1310 TTI::PeelingPreferences &PP) {
1311 CommonTTI.getPeelingPreferences(L, SE, PP);
1312}
1313
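// Cost of a 64-bit ALU operation, depending on whether the subtarget has
// full-rate, half-rate, or only quarter-rate 64-bit instructions.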
1314int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1315 return ST->hasFullRate64Ops()
1316 ? getFullRateInstrCost()
1317 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1318 : getQuarterRateInstrCost(CostKind);
1319}
1320
1321std::pair<InstructionCost, MVT>
1322GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1323 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1324 auto Size = DL.getTypeSizeInBits(Ty);
1325 // Maximum load or store can handle 8 dwords for scalar and 4 for
1326 // vector ALU. Let's assume anything above 8 dwords is expensive
1327 // even if legal.
1328 if (Size <= 256)
1329 return Cost;
1330
1331 Cost.first += (Size + 255) / 256;
1332 return Cost;
1333}
aarch64 promote const
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfo::Concept conforming object specific to the AMDGPU target machine.
static const Function * getParent(const Value *V)
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
Analysis containing CSE Info
Definition: CSEInfo.cpp:27
static cl::opt< TargetTransformInfo::TargetCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(TargetTransformInfo::TCK_RecipThroughput), cl::values(clEnumValN(TargetTransformInfo::TCK_RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(TargetTransformInfo::TCK_Latency, "latency", "Instruction latency"), clEnumValN(TargetTransformInfo::TCK_CodeSize, "code-size", "Code size"), clEnumValN(TargetTransformInfo::TCK_SizeAndLatency, "size-latency", "Code size and latency")))
return RetTy
#define LLVM_DEBUG(X)
Definition: Debug.h:101
uint64_t Size
Hexagon Common GEP
#define RegName(no)
static LVOptions Options
Definition: LVOptions.cpp:25
#define F(x, y, z)
Definition: MD5.cpp:55
#define I(x, y, z)
Definition: MD5.cpp:58
unsigned const TargetRegisterInfo * TRI
LLVMContext & Context
const char LLVMTargetMachineRef TM
const SmallVectorImpl< MachineOperand > & Cond
assert(ImpDefSCC.getReg()==AMDGPU::SCC &&ImpDefSCC.isDef())
bool hasMadMacF32Insts() const
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const
Return the maximum workitem ID value in the function, for the given (0, 1, 2) dimension.
unsigned getWavefrontSizeLog2() const
bool has16BitInsts() const
bool hasFastFMAF32() const
bool isSingleLaneExecution(const Function &Kernel) const
Return true if only a single workitem can be active in a wave.
bool hasVOP3PInsts() const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
an instruction to allocate memory on the stack
Definition: Instructions.h:58
bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
Type * getAllocatedType() const
Return the type that is being allocated by the instruction.
Definition: Instructions.h:118
This class represents an incoming formal argument to a Function.
Definition: Argument.h:28
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition: ArrayRef.h:41
size_t size() const
size - Get the array size.
Definition: ArrayRef.h:165
bool empty() const
empty - Check if the array is empty.
Definition: ArrayRef.h:160
LLVM Basic Block Representation.
Definition: BasicBlock.h:56
const Function * getParent() const
Return the enclosing method, or null if none.
Definition: BasicBlock.h:112
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
Get intrinsic cost based on arguments.
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
Definition: BasicTTIImpl.h:856
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *Ty, int &Index, VectorType *&SubTy) const
Definition: BasicTTIImpl.h:934
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
Try to calculate op costs for min/max reduction operations.
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
Definition: BasicTTIImpl.h:978
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
Definition: BasicTTIImpl.h:619
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
Estimate the cost of type-legalization and the legalized type.
Definition: BasicTTIImpl.h:820
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Definition: InstrTypes.h:1190
bool isInlineAsm() const
Check if this call is an inline asm statement.
Definition: InstrTypes.h:1479
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Definition: InstrTypes.h:1412
CallingConv::ID getCallingConv() const
Definition: InstrTypes.h:1470
Value * getArgOperand(unsigned i) const
Definition: InstrTypes.h:1357
void setArgOperand(unsigned i, Value *v)
Definition: InstrTypes.h:1362
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
Definition: InstrTypes.h:1348
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
Definition: InstrTypes.h:1388
void setCalledFunction(Function *Fn)
Sets the function called, including updating the function type.
Definition: InstrTypes.h:1451
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition: Constants.h:78
static ConstantInt * getTrue(LLVMContext &Context)
Definition: Constants.cpp:833
static ConstantInt * getFalse(LLVMContext &Context)
Definition: Constants.cpp:840
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition: Constants.h:151
This class represents an Operation in the Expression.
A parsed version of the target data layout string in and methods for querying it.
Definition: DataLayout.h:110
unsigned getPointerSizeInBits(unsigned AS=0) const
Layout pointer size, in bits FIXME: The defaults need to be removed once all of the backends/clients ...
Definition: DataLayout.h:410
TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
Definition: DataLayout.h:504
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition: DataLayout.h:672
constexpr bool isScalar() const
Exactly one element.
Definition: TypeSize.h:302
This instruction extracts a struct member or array element value from an aggregate value.
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition: FMF.h:20
Container class for subtarget features.
static FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition: Type.cpp:693
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
Definition: GCNSubtarget.h:440
const SIRegisterInfo * getRegisterInfo() const override
Definition: GCNSubtarget.h:247
bool hasUnalignedScratchAccess() const
Definition: GCNSubtarget.h:552
bool hasPackedFP32Ops() const
Definition: GCNSubtarget.h:923
bool hasFullRate64Ops() const
Definition: GCNSubtarget.h:342
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
Definition: GCNSubtarget.h:309
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind)
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, Value *Op0, Value *Op1)
bool isAlwaysUniform(const Value *V) const
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP)
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
int64_t getMaxMemIntrinsicInlineSizeThreshold() const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args=std::nullopt)
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind)
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getNumberOfRegisters(unsigned RCID) const
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
unsigned getMaxInterleaveFactor(ElementCount VF)
unsigned getInliningThresholdMultiplier() const
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE)
unsigned getMinVectorRegisterBitWidth() const
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args=ArrayRef< const Value * >(), const Instruction *CxtI=nullptr)
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const
unsigned adjustInliningThreshold(const CallBase *CB) const
bool areInlineCompatible(const Function *Caller, const Function *Callee) const
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const
bool isSourceOfDivergence(const Value *V) const
bool hasBranchDivergence(const Function *F=nullptr) const
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicElementSize) const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind)
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Definition: Instructions.h:940
Module * getParent()
Get the module that this global value is contained inside of...
Definition: GlobalValue.h:652
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition: IRBuilder.h:2625
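As a concrete illustration of the IRBuilder API described above, the following hypothetical helper (emitMulAdd, a name invented here) builds two instructions before a given insertion point:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Hypothetical helper: emit (A + B) * A immediately before instruction IP.
  static Value *emitMulAdd(Instruction *IP, Value *A, Value *B) {
    IRBuilder<> Builder(IP);                     // insertion point = before IP
    Value *Sum = Builder.CreateAdd(A, B, "sum"); // may constant-fold
    return Builder.CreateMul(Sum, A, "prod");
  }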
std::optional< CostType > getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
const Module * getModule() const
Return the module owning the function this instruction belongs to or nullptr if the function does not...
Definition: Instruction.cpp:71
const BasicBlock * getParent() const
Definition: Instruction.h:90
bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
A wrapper class for inspecting calls to intrinsic functions.
Definition: IntrinsicInst.h:47
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
Definition: IntrinsicInst.h:54
This is an important class for using LLVM in a threaded context.
Definition: LLVMContext.h:67
An instruction for reading from memory.
Definition: Instructions.h:177
Represents a single loop in the control flow graph.
Definition: LoopInfo.h:47
Metadata node.
Definition: Metadata.h:950
Machine Value Type.
static MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Definition: ValueTypes.cpp:573
Root of the metadata hierarchy.
Definition: Metadata.h:61
A Module instance is used to store all the information related to an LLVM module.
Definition: Module.h:65
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition: Module.h:254
The optimization diagnostic interface.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
Definition: SmallPtrSet.h:366
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
Definition: SmallPtrSet.h:451
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
Definition: SmallVector.h:577
void push_back(const T &Elt)
Definition: SmallVector.h:416
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Definition: SmallVector.h:1200
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition: StringRef.h:50
int InstructionOpcodeToISD(unsigned Opcode) const
Get the ISD node that corresponds to the Instruction class opcode.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
const TargetMachine & getTargetMachine() const
std::vector< AsmOperandInfo > AsmOperandInfoVector
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
Primary interface to the complete machine description for the target machine.
Definition: TargetMachine.h:78
TargetOptions Options
const DataLayout & getDataLayout() const
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign, std::optional< uint32_t > AtomicCpySize) const
TargetCostKind
The kind of cost model.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of FastMathFlags FMF.
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition: TypeSize.h:322
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition: TypeSize.h:325
The instances of the Type class are immutable: once they are created, they are never changed.
Definition: Type.h:45
unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static IntegerType * getIntNTy(LLVMContext &C, unsigned N)
unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isSized(SmallPtrSetImpl< Type * > *Visited=nullptr) const
Return true if it makes sense to take the size of this type.
Definition: Type.h:302
static IntegerType * getInt16Ty(LLVMContext &C)
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition: Type.h:129
static IntegerType * getInt8Ty(LLVMContext &C)
static IntegerType * getInt32Ty(LLVMContext &C)
static IntegerType * getInt64Ty(LLVMContext &C)
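A small usage sketch of these Type getters (the function name is illustrative only):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Type.h"
  using namespace llvm;

  // Illustrative only: the fixed-width getters are shorthands for getIntNTy.
  static void makeSomeIntTypes(LLVMContext &Ctx) {
    IntegerType *I8  = Type::getInt8Ty(Ctx);
    IntegerType *I32 = Type::getInt32Ty(Ctx);
    IntegerType *I96 = Type::getIntNTy(Ctx, 96); // arbitrary bit width
    (void)I8; (void)I32; (void)I96;
  }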
A Use represents the edge between a Value definition and its users.
Definition: Use.h:43
Value * getOperand(unsigned i) const
Definition: User.h:169
LLVM Value Representation.
Definition: Value.h:74
Type * getType() const
All values are typed, get the type of this value.
Definition: Value.h:255
user_iterator user_begin()
Definition: Value.h:397
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition: Value.h:434
LLVMContext & getContext() const
All values hold a context through their type.
Definition: Value.cpp:1069
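A short sketch combining the Value queries above; hasSingleStoreUse is a hypothetical predicate, not an LLVM API:

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Value.h"
  using namespace llvm;

  // Hypothetical predicate: V has exactly one use, and that user is a store.
  static bool hasSingleStoreUse(const Value *V) {
    if (!V->hasOneUse())
      return false;
    return isa<StoreInst>(*V->user_begin());
  }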
Base class of all SIMD vector types.
Definition: DerivedTypes.h:400
Type * getElementType() const
Definition: DerivedTypes.h:433
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
Definition: AMDGPU.h:398
@ REGION_ADDRESS
Address space for region memory. (GDS)
Definition: AMDGPU.h:392
@ LOCAL_ADDRESS
Address space for local memory.
Definition: AMDGPU.h:395
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
Definition: AMDGPU.h:394
@ FLAT_ADDRESS
Address space for flat memory.
Definition: AMDGPU.h:390
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
Definition: AMDGPU.h:391
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
Definition: AMDGPU.h:400
@ PRIVATE_ADDRESS
Address space for private memory.
Definition: AMDGPU.h:396
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
Definition: AMDGPU.h:403
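The AMDGPUAS enumerators above live in the backend-internal AMDGPU.h header (as the Definition links indicate), so the sketch below assumes it is built inside the AMDGPU target directory; loadsFromLDSOrScratch is a hypothetical helper.

  #include "AMDGPU.h"                // backend-internal; provides AMDGPUAS
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical helper: does this load read from LDS (local) or scratch (private)?
  static bool loadsFromLDSOrScratch(const LoadInst *LI) {
    unsigned AS = LI->getPointerAddressSpace();
    return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS;
  }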
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
Definition: AMDGPU.h:452
@ C
The default llvm calling convention, compatible with C.
Definition: CallingConv.h:34
@ ADD
Simple integer binary arithmetic operators.
Definition: ISDOpcodes.h:239
@ FADD
Simple binary floating point operators.
Definition: ISDOpcodes.h:390
@ FNEG
Perform various unary floating-point operations inspired by libm.
Definition: ISDOpcodes.h:925
@ SHL
Shift and rotation operations.
Definition: ISDOpcodes.h:705
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition: ISDOpcodes.h:680
Function * getDeclaration(Module *M, ID id, ArrayRef< Type * > Tys=std::nullopt)
Create or insert an LLVM Function declaration for an intrinsic, and return it.
Definition: Function.cpp:1422
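Intrinsic::getDeclaration combines naturally with IRBuilder; emitWorkitemIdX below is a hypothetical helper that materializes a call to the (non-overloaded) llvm.amdgcn.workitem.id.x intrinsic:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/IntrinsicsAMDGPU.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  // Hypothetical helper: insert a call to llvm.amdgcn.workitem.id.x before IP.
  static CallInst *emitWorkitemIdX(Instruction *IP) {
    Module *M = IP->getModule();
    Function *Decl =
        Intrinsic::getDeclaration(M, Intrinsic::amdgcn_workitem_id_x);
    IRBuilder<> B(IP);
    return B.CreateCall(Decl);
  }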
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
Definition: PatternMatch.h:49
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
Definition: PatternMatch.h:147
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
Definition: PatternMatch.h:826
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
Definition: PatternMatch.h:76
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
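The m_* matchers compose through match(); looksLikeMaskedOrShifted below is a hypothetical example that recognizes a commuted and-with-constant or a right shift by a constant:

  #include "llvm/IR/PatternMatch.h"
  using namespace llvm;
  using namespace llvm::PatternMatch;

  // Hypothetical matcher: (X & C) in either operand order, or X >> C
  // (logical or arithmetic), for some ConstantInt C.
  static bool looksLikeMaskedOrShifted(Value *V) {
    Value *X = nullptr;
    return match(V, m_c_And(m_Value(X), m_ConstantInt())) ||
           match(V, m_LShr(m_Value(X), m_ConstantInt())) ||
           match(V, m_AShr(m_Value(X), m_ConstantInt()));
  }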
initializer< Ty > init(const Ty &Val)
Definition: CommandLine.h:445
This is an optimization pass for GlobalISel generic memory operations.
Definition: AddressRanges.h:18
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< TypeSize > *Offsets, TypeSize StartingOffset)
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition: Analysis.cpp:122
const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=6)
This method strips off any GEP address adjustments and pointer casts from the specified value,...
MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
Definition: LoopInfo.cpp:1042
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1734
raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition: Debug.cpp:163
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition: STLExtras.h:1741
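A small usage sketch of the range-based wrappers; noPHIOperands is an invented name:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Hypothetical check: none of I's operands is a PHI node.
  static bool noPHIOperands(const Instruction &I) {
    return llvm::none_of(I.operands(),
                         [](const Use &U) { return isa<PHINode>(U.get()); });
  }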
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ FAdd
Sum of floats.
void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, unsigned Depth=0, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
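A sketch of driving computeKnownBits by hand; highHalfKnownOne is a hypothetical helper and the "upper half" threshold is arbitrary:

  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/DataLayout.h"
  #include "llvm/IR/Value.h"
  #include "llvm/Support/KnownBits.h"
  using namespace llvm;

  // Hypothetical helper: are at least the top half of V's bits known to be one?
  static bool highHalfKnownOne(const Value *V, const DataLayout &DL) {
    unsigned BitWidth = V->getType()->getScalarSizeInBits();
    KnownBits Known(BitWidth);
    computeKnownBits(V, Known, DL); // AC/CxtI/DT left at their defaults
    return Known.countMinLeadingOnes() * 2 >= BitWidth;
  }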
constexpr uint64_t MinAlign(uint64_t A, uint64_t B)
A and B are either alignments or offsets.
Definition: MathExtras.h:338
InstructionCost Cost
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition: Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition: ValueTypes.h:34
uint64_t getScalarSizeInBits() const
Definition: ValueTypes.h:363
unsigned countMinLeadingOnes() const
Returns the minimum number of leading one bits.
Definition: KnownBits.h:242
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Don't disable runtime unroll for the loops which were vectorized.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll profitability.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
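These UnrollingPreferences fields are what a target's getUnrollingPreferences override fills in. The sketch below is purely schematic; tunePreferences is an invented helper and the numbers are illustrative, not any target's actual tuning.

  #include "llvm/Analysis/TargetTransformInfo.h"
  using namespace llvm;

  // Illustrative only: how a hypothetical target hook might populate the struct.
  static void tunePreferences(TargetTransformInfo::UnrollingPreferences &UP) {
    UP.Threshold = 150;                  // cost budget for full unrolling
    UP.PartialThreshold = 75;            // smaller budget for partial unrolling
    UP.Partial = true;                   // allow partial unrolling
    UP.Runtime = false;                  // no runtime trip-count based unrolling
    UP.UnrollVectorizedLoop = false;     // don't force unroll of vectorized loops
    UP.MaxIterationsCountToAnalyze = 32; // cap full-unroll simulation work
  }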