AMDGPUTargetTransformInfo.cpp
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
17#include "AMDGPUTargetTransformInfo.h"
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include <optional>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
38static cl::opt<unsigned> UnrollThresholdPrivate(
39    "amdgpu-unroll-threshold-private",
40 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
41 cl::init(2700), cl::Hidden);
42
43static cl::opt<unsigned> UnrollThresholdLocal(
44    "amdgpu-unroll-threshold-local",
45 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
46 cl::init(1000), cl::Hidden);
47
48static cl::opt<unsigned> UnrollThresholdIf(
49    "amdgpu-unroll-threshold-if",
50 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
51 cl::init(200), cl::Hidden);
52
53static cl::opt<bool> UnrollRuntimeLocal(
54    "amdgpu-unroll-runtime-local",
55 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
56 cl::init(true), cl::Hidden);
57
58static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
59    "amdgpu-unroll-max-block-to-analyze",
60 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
61 cl::init(32), cl::Hidden);
62
63static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
64 cl::Hidden, cl::init(4000),
65 cl::desc("Cost of alloca argument"));
66
67// If the amount of scratch memory to eliminate exceeds our ability to allocate
68// it into registers we gain nothing by aggressively inlining functions for that
69// heuristic.
70static cl::opt<unsigned>
71    ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
72 cl::init(256),
73 cl::desc("Maximum alloca size to use for inline cost"));
74
75// Inliner constraint to achieve reasonable compilation time.
76static cl::opt<size_t> InlineMaxBB(
77    "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
78 cl::desc("Maximum number of BBs allowed in a function after inlining"
79 " (compile time constraint)"));
80
81// This default unroll factor is based on microbenchmarks on gfx1030.
82static cl::opt<unsigned> MemcpyLoopUnroll(
83    "amdgpu-memcpy-loop-unroll",
84    cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
85             "operations when lowering statically-sized memcpy, memmove, or "
86             "memset as a loop"),
87 cl::init(16), cl::Hidden);
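// With the 4 x i32 vector type chosen in getMemcpyLoopLoweringType below, an
// unroll factor of 16 corresponds to roughly 16 * 16 = 256 bytes moved per
// unrolled loop iteration.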
88
89static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
90 unsigned Depth = 0) {
91  const Instruction *I = dyn_cast<Instruction>(Cond);
92  if (!I)
93 return false;
94
95 for (const Value *V : I->operand_values()) {
96 if (!L->contains(I))
97 continue;
98 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
99 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
100 return SubLoop->contains(PHI); }))
101 return true;
102 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
103 return true;
104 }
105 return false;
106}
107
108AMDGPUTTIImpl::AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
109    : BaseT(TM, F.getDataLayout()),
110 TargetTriple(TM->getTargetTriple()),
111 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
112 TLI(ST->getTargetLowering()) {}
113
114void AMDGPUTTIImpl::getUnrollingPreferences(
115    Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP,
116    OptimizationRemarkEmitter *ORE) const {
117 const Function &F = *L->getHeader()->getParent();
118 UP.Threshold =
119 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
120 UP.MaxCount = std::numeric_limits<unsigned>::max();
121 UP.Partial = true;
122
123  // A conditional branch in a loop back edge needs 3 additional exec
124  // manipulations on average.
125 UP.BEInsns += 3;
126
127 // We want to run unroll even for the loops which have been vectorized.
128 UP.UnrollVectorizedLoop = true;
129
130 // Enable runtime unrolling for loops whose trip count is not known at
131 // compile time. Use a reduced PartialThreshold to limit code-size growth.
132 UP.Runtime = true;
133 UP.PartialThreshold = UP.Threshold / 4;
134
135  // Maximum alloca size that can fit in registers. Reserve 16 registers.
136 const unsigned MaxAlloca = (256 - 16) * 4;
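  // (256 - 16) * 4: 256 VGPRs per lane minus 16 reserved, at 4 bytes each,
  // i.e. up to 960 bytes of private memory that could plausibly be promoted
  // to registers.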
137 unsigned ThresholdPrivate = UnrollThresholdPrivate;
138 unsigned ThresholdLocal = UnrollThresholdLocal;
139
140 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
141 // provided threshold value as the default for Threshold
142 if (MDNode *LoopUnrollThreshold =
143 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
144 if (LoopUnrollThreshold->getNumOperands() == 2) {
145      ConstantInt *MetaThresholdValue = mdconst::extract_or_null<ConstantInt>(
146          LoopUnrollThreshold->getOperand(1));
147 if (MetaThresholdValue) {
148 // We will also use the supplied value for PartialThreshold for now.
149 // We may introduce additional metadata if it becomes necessary in the
150 // future.
151 UP.Threshold = MetaThresholdValue->getSExtValue();
152        UP.PartialThreshold = UP.Threshold / 4;
153        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
154 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
155 }
156 }
157 }
158
159 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
160 for (const BasicBlock *BB : L->getBlocks()) {
161 const DataLayout &DL = BB->getDataLayout();
162 unsigned LocalGEPsSeen = 0;
163
164 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
165 return SubLoop->contains(BB); }))
166 continue; // Block belongs to an inner loop.
167
168 for (const Instruction &I : *BB) {
169      // Unroll a loop which contains an "if" statement whose condition is
170      // defined by a PHI belonging to the loop. This may help to eliminate the
171      // if region and potentially even the PHI itself, saving on both
172      // divergence and registers used for the PHI.
173      // Add a small bonus for each such "if" statement.
174      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
175        if (UP.Threshold < MaxBoost && Br->isConditional()) {
176 BasicBlock *Succ0 = Br->getSuccessor(0);
177 BasicBlock *Succ1 = Br->getSuccessor(1);
178 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
179 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
180 continue;
181 if (dependsOnLocalPhi(L, Br->getCondition())) {
182            UP.Threshold += UnrollThresholdIf;
183            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
184 << " for loop:\n"
185 << *L << " due to " << *Br << '\n');
186 if (UP.Threshold >= MaxBoost)
187 return;
188 }
189 }
190 continue;
191 }
192
193      const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
194      if (!GEP)
195 continue;
196
197 unsigned AS = GEP->getAddressSpace();
198 unsigned Threshold = 0;
199      if (AS == AMDGPUAS::PRIVATE_ADDRESS)
200        Threshold = ThresholdPrivate;
201      else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
202        Threshold = ThresholdLocal;
203 else
204 continue;
205
206 if (UP.Threshold >= Threshold)
207 continue;
208
209 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
210 const Value *Ptr = GEP->getPointerOperand();
211 const AllocaInst *Alloca =
212            dyn_cast<AllocaInst>(getUnderlyingObject(Ptr));
213        if (!Alloca || !Alloca->isStaticAlloca())
214 continue;
215 auto AllocaSize = Alloca->getAllocationSize(DL);
216 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
217 continue;
218 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
219                 AS == AMDGPUAS::REGION_ADDRESS) {
220        LocalGEPsSeen++;
221        // Inhibit unroll for local memory if we have seen addressing not to
222        // a variable; most likely we will be unable to combine it.
223        // Do not unroll too deep inner loops for local memory to give a chance
224        // to unroll an outer loop for a more important reason.
225 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
226 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
227 !isa<Argument>(GEP->getPointerOperand())))
228 continue;
229 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
230 << *L << " due to LDS use.\n");
231        UP.Runtime = UnrollRuntimeLocal;
232      }
233
234 // Check if GEP depends on a value defined by this loop itself.
235 bool HasLoopDef = false;
236 for (const Value *Op : GEP->operands()) {
237 const Instruction *Inst = dyn_cast<Instruction>(Op);
238 if (!Inst || L->isLoopInvariant(Op))
239 continue;
240
241 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
242 return SubLoop->contains(Inst); }))
243 continue;
244 HasLoopDef = true;
245 break;
246 }
247 if (!HasLoopDef)
248 continue;
249
250 // We want to do whatever we can to limit the number of alloca
251      // instructions that make it through to the code generator. Allocas
252 // require us to use indirect addressing, which is slow and prone to
253 // compiler bugs. If this loop does an address calculation on an
254 // alloca ptr, then we want to use a higher than normal loop unroll
255 // threshold. This will give SROA a better chance to eliminate these
256 // allocas.
257 //
258 // We also want to have more unrolling for local memory to let ds
259 // instructions with different offsets combine.
260 //
261 // Don't use the maximum allowed value here as it will make some
262 // programs way too big.
263 UP.Threshold = Threshold;
264 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
265 << " for loop:\n"
266 << *L << " due to " << *GEP << '\n');
267 if (UP.Threshold >= MaxBoost)
268 return;
269 }
270
271    // If we got a GEP in a small BB from an inner loop then increase the max
272    // trip count to analyze for a better cost estimation of the unroll.
273    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
274      UP.MaxIterationsCountToAnalyze = 32;
275  }
276}
277
282
286
287const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
288 // Codegen control options which don't matter.
289 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
290 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
291 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
292
293 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
294
295 // Property of the kernel/environment which can't actually differ.
296 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
297 AMDGPU::FeatureTrapHandler,
298
299    // The default assumption needs to be that ECC is enabled, but no directly
300    // exposed operations depend on it, so it can be safely inlined.
301 AMDGPU::FeatureSRAMECC,
302
303 // Perf-tuning features
304 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
305
306GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
307    : BaseT(TM, F.getDataLayout()),
308 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
309 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
310 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
311  SIModeRegisterDefaults Mode(F, *ST);
312  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
313 HasFP64FP16Denormals =
314 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
315}
316
317bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
318  return !F || !ST->isSingleLaneExecution(*F);
319}
320
321unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
322 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
323 // registers. See getRegisterClassForType for the implementation.
324 // In this case vector registers are not vector in terms of
325 // VGPRs, but those which can hold multiple values.
326
327 // This is really the number of registers to fill when vectorizing /
328 // interleaving loops, so we lie to avoid trying to use all registers.
329 return 4;
330}
331
332TypeSize
333GCNTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
334  switch (K) {
335  case TargetTransformInfo::RGK_Scalar:
336    return TypeSize::getFixed(32);
337  case TargetTransformInfo::RGK_FixedWidthVector:
338    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
339  case TargetTransformInfo::RGK_ScalableVector:
340    return TypeSize::getScalable(0);
341 }
342 llvm_unreachable("Unsupported register kind");
343}
344
345unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
346  return 32;
347}
348
349unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
350 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
351 return 32 * 4 / ElemWidth;
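  // The formula above caps a vectorized load/store chain at one 128-bit
  // access: a maximum VF of 4 for 32-bit elements, 8 for 16-bit, 16 for 8-bit.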
352  // For a given width return the max number of elements that can be combined
353  // into a wider bit value:
354 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
355 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
356 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
357 : 1;
358}
359
360unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
361 unsigned ChainSizeInBytes,
362 VectorType *VecTy) const {
363 unsigned VecRegBitWidth = VF * LoadSize;
364 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
365 // TODO: Support element-size less than 32bit?
366 return 128 / LoadSize;
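    // e.g. a chain of 16 x 16-bit loads (256 bits) is clamped back to
    // 128 / 16 = 8 elements.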
367
368 return VF;
369}
370
371unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
372 unsigned ChainSizeInBytes,
373 VectorType *VecTy) const {
374 unsigned VecRegBitWidth = VF * StoreSize;
375 if (VecRegBitWidth > 128)
376 return 128 / StoreSize;
377
378 return VF;
379}
380
381unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
382 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
383      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
384      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
385      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
386      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
387      AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
388    return 512;
389 }
390
391 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
392 return 8 * ST->getMaxPrivateElementSize();
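  // e.g. with a max private element size of 16 bytes this allows 128-bit
  // vectors in scratch.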
393
394 // Common to flat, global, local and region. Assume for unknown addrspace.
395 return 128;
396}
397
398bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
399 Align Alignment,
400 unsigned AddrSpace) const {
401 // We allow vectorization of flat stores, even though we may need to decompose
402 // them later if they may access private memory. We don't have enough context
403 // here, and legalization can handle it.
404 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
405 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
406 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
407 }
408 return true;
409}
410
411bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
412 Align Alignment,
413 unsigned AddrSpace) const {
414 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
415}
416
417bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
418 Align Alignment,
419 unsigned AddrSpace) const {
420 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
421}
422
426
427Type *GCNTTIImpl::getMemcpyLoopLoweringType(
428    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
429 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
430 std::optional<uint32_t> AtomicElementSize) const {
431
432 if (AtomicElementSize)
433 return Type::getIntNTy(Context, *AtomicElementSize * 8);
434
435 // 16-byte accesses achieve the highest copy throughput.
436 // If the operation has a fixed known length that is large enough, it is
437 // worthwhile to return an even wider type and let legalization lower it into
438 // multiple accesses, effectively unrolling the memcpy loop.
439 // We also rely on legalization to decompose into smaller accesses for
440 // subtargets and address spaces where it is necessary.
441 //
442 // Don't unroll if Length is not a constant, since unrolling leads to worse
443 // performance for length values that are smaller or slightly larger than the
444 // total size of the type returned here. Mitigating that would require a more
445 // complex lowering for variable-length memcpy and memmove.
446 unsigned I32EltsInVector = 4;
447  if (MemcpyLoopUnroll > 0 && isa<ConstantInt>(Length))
448    return FixedVectorType::get(Type::getInt32Ty(Context),
449                                MemcpyLoopUnroll * I32EltsInVector);
450
451 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
452}
453
454void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
455    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
456 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
457 Align SrcAlign, Align DestAlign,
458 std::optional<uint32_t> AtomicCpySize) const {
459
460 if (AtomicCpySize)
461    return BaseT::getMemcpyLoopResidualLoweringType(
462        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
463 DestAlign, AtomicCpySize);
464
465 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
466 while (RemainingBytes >= 16) {
467 OpsOut.push_back(I32x4Ty);
468 RemainingBytes -= 16;
469 }
470
471 Type *I64Ty = Type::getInt64Ty(Context);
472 while (RemainingBytes >= 8) {
473 OpsOut.push_back(I64Ty);
474 RemainingBytes -= 8;
475 }
476
477 Type *I32Ty = Type::getInt32Ty(Context);
478 while (RemainingBytes >= 4) {
479 OpsOut.push_back(I32Ty);
480 RemainingBytes -= 4;
481 }
482
483 Type *I16Ty = Type::getInt16Ty(Context);
484 while (RemainingBytes >= 2) {
485 OpsOut.push_back(I16Ty);
486 RemainingBytes -= 2;
487 }
488
489 Type *I8Ty = Type::getInt8Ty(Context);
490 while (RemainingBytes) {
491 OpsOut.push_back(I8Ty);
492 --RemainingBytes;
493 }
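  // For example, RemainingBytes == 27 decomposes into <4 x i32>, i64, i16 and
  // i8 (16 + 8 + 2 + 1 bytes).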
494}
495
496unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {
497  // Disable unrolling if the loop is not vectorized.
498 // TODO: Enable this again.
499 if (VF.isScalar())
500 return 1;
501
502 return 8;
503}
504
505bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
506                                    MemIntrinsicInfo &Info) const {
507 switch (Inst->getIntrinsicID()) {
508 case Intrinsic::amdgcn_ds_ordered_add:
509 case Intrinsic::amdgcn_ds_ordered_swap: {
510 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
511 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
512 if (!Ordering || !Volatile)
513 return false; // Invalid.
514
515 unsigned OrderingVal = Ordering->getZExtValue();
516 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
517 return false;
518
519 Info.PtrVal = Inst->getArgOperand(0);
520 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
521 Info.ReadMem = true;
522 Info.WriteMem = true;
523 Info.IsVolatile = !Volatile->isZero();
524 return true;
525 }
526 default:
527 return false;
528 }
529}
530
531InstructionCost GCNTTIImpl::getArithmeticInstrCost(
532    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
533    TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
534    ArrayRef<const Value *> Args, const Instruction *CxtI) const {
535
536 // Legalize the type.
537 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
538 int ISD = TLI->InstructionOpcodeToISD(Opcode);
539
540  // Because we don't have any legal vector operations, only legal vector
541  // types, we need to account for split vectors.
542 unsigned NElts = LT.second.isVector() ?
543 LT.second.getVectorNumElements() : 1;
544
545 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
546
547 switch (ISD) {
548 case ISD::SHL:
549 case ISD::SRL:
550 case ISD::SRA:
551 if (SLT == MVT::i64)
552 return get64BitInstrCost(CostKind) * LT.first * NElts;
553
554 if (ST->has16BitInsts() && SLT == MVT::i16)
555 NElts = (NElts + 1) / 2;
556
557 // i32
558 return getFullRateInstrCost() * LT.first * NElts;
559 case ISD::ADD:
560 case ISD::SUB:
561 case ISD::AND:
562 case ISD::OR:
563 case ISD::XOR:
564 if (SLT == MVT::i64) {
565 // and, or and xor are typically split into 2 VALU instructions.
566 return 2 * getFullRateInstrCost() * LT.first * NElts;
567 }
568
569 if (ST->has16BitInsts() && SLT == MVT::i16)
570 NElts = (NElts + 1) / 2;
571
572 return LT.first * NElts * getFullRateInstrCost();
573 case ISD::MUL: {
574 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
575 if (SLT == MVT::i64) {
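      // The constants below roughly model an i64 multiply as four 32-bit
      // multiplies (quarter rate) plus four full-rate instructions to combine
      // the partial products.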
576 const int FullRateCost = getFullRateInstrCost();
577 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
578 }
579
580 if (ST->has16BitInsts() && SLT == MVT::i16)
581 NElts = (NElts + 1) / 2;
582
583 // i32
584 return QuarterRateCost * NElts * LT.first;
585 }
586 case ISD::FMUL:
587    // Check for a possible fusion of {fadd|fsub}(a, fmul(b, c)) and return a
588    // zero cost for the fmul(b, c), assuming the fadd|fsub will be charged
589    // the estimated cost of the whole fused operation.
590 if (CxtI && CxtI->hasOneUse())
591 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
592 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
593 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
594          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
595            return TargetTransformInfo::TCC_Free;
596          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
597            return TargetTransformInfo::TCC_Free;
598
599          // Estimate all types may be fused with contract/unsafe flags
599 // Estimate all types may be fused with contract/unsafe flags
600 const TargetOptions &Options = TLI->getTargetMachine().Options;
601 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
602 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
603            return TargetTransformInfo::TCC_Free;
604        }
605 }
606 [[fallthrough]];
607 case ISD::FADD:
608 case ISD::FSUB:
609 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
610 NElts = (NElts + 1) / 2;
611 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
612 NElts = (NElts + 1) / 2;
613 if (SLT == MVT::f64)
614 return LT.first * NElts * get64BitInstrCost(CostKind);
615
616 if (ST->has16BitInsts() && SLT == MVT::f16)
617 NElts = (NElts + 1) / 2;
618
619 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
620 return LT.first * NElts * getFullRateInstrCost();
621 break;
622 case ISD::FDIV:
623 case ISD::FREM:
624 // FIXME: frem should be handled separately. The fdiv in it is most of it,
625 // but the current lowering is also not entirely correct.
626 if (SLT == MVT::f64) {
627 int Cost = 7 * get64BitInstrCost(CostKind) +
628 getQuarterRateInstrCost(CostKind) +
629 3 * getHalfRateInstrCost(CostKind);
630 // Add cost of workaround.
631 if (!ST->hasUsableDivScaleConditionOutput())
632 Cost += 3 * getFullRateInstrCost();
633
634 return LT.first * Cost * NElts;
635 }
636
637 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
638 // TODO: This is more complicated, unsafe flags etc.
639 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
640 (SLT == MVT::f16 && ST->has16BitInsts())) {
641 return LT.first * getTransInstrCost(CostKind) * NElts;
642 }
643 }
644
645 if (SLT == MVT::f16 && ST->has16BitInsts()) {
646 // 2 x v_cvt_f32_f16
647 // f32 rcp
648 // f32 fmul
649 // v_cvt_f16_f32
650 // f16 div_fixup
651 int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
652 return LT.first * Cost * NElts;
653 }
654
655 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
656 // Fast unsafe fdiv lowering:
657 // f32 rcp
658 // f32 fmul
659 int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
660 return LT.first * Cost * NElts;
661 }
662
663 if (SLT == MVT::f32 || SLT == MVT::f16) {
664 // 4 more v_cvt_* insts without f16 insts support
665 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
666 1 * getTransInstrCost(CostKind);
667
668 if (!HasFP32Denormals) {
669 // FP mode switches.
670 Cost += 2 * getFullRateInstrCost();
671 }
672
673 return LT.first * NElts * Cost;
674 }
675 break;
676 case ISD::FNEG:
677    // Use the backend's estimation. If fneg is not free, each element will
678    // cost one additional instruction.
679 return TLI->isFNegFree(SLT) ? 0 : NElts;
680 default:
681 break;
682 }
683
684 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
685 Args, CxtI);
686}
687
688// Return true if there's a potential benefit from using v2f16/v2i16
689// instructions for an intrinsic, even if it requires nontrivial legalization.
690static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
691  switch (ID) {
692 case Intrinsic::fma:
693 case Intrinsic::fmuladd:
694 case Intrinsic::copysign:
695 case Intrinsic::minimumnum:
696 case Intrinsic::maximumnum:
697 case Intrinsic::canonicalize:
698 // There's a small benefit to using vector ops in the legalized code.
699 case Intrinsic::round:
700 case Intrinsic::uadd_sat:
701 case Intrinsic::usub_sat:
702 case Intrinsic::sadd_sat:
703 case Intrinsic::ssub_sat:
704 case Intrinsic::abs:
705 return true;
706 default:
707 return false;
708 }
709}
710
711InstructionCost
712GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
713                                  TTI::TargetCostKind CostKind) const {
714  switch (ICA.getID()) {
715 case Intrinsic::fabs:
716 // Free source modifier in the common case.
717 return 0;
718 case Intrinsic::amdgcn_workitem_id_x:
719 case Intrinsic::amdgcn_workitem_id_y:
720 case Intrinsic::amdgcn_workitem_id_z:
721 // TODO: If hasPackedTID, or if the calling context is not an entry point
722 // there may be a bit instruction.
723 return 0;
724 case Intrinsic::amdgcn_workgroup_id_x:
725 case Intrinsic::amdgcn_workgroup_id_y:
726 case Intrinsic::amdgcn_workgroup_id_z:
727 case Intrinsic::amdgcn_lds_kernel_id:
728 case Intrinsic::amdgcn_dispatch_ptr:
729 case Intrinsic::amdgcn_dispatch_id:
730 case Intrinsic::amdgcn_implicitarg_ptr:
731 case Intrinsic::amdgcn_queue_ptr:
732 // Read from an argument register.
733 return 0;
734 default:
735 break;
736 }
737
738 Type *RetTy = ICA.getReturnType();
739
740 Intrinsic::ID IID = ICA.getID();
741 switch (IID) {
742 case Intrinsic::exp:
743 case Intrinsic::exp2:
744 case Intrinsic::exp10: {
745 // Legalize the type.
746 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
747 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
748 unsigned NElts =
749 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
750
751 if (SLT == MVT::f64) {
752 unsigned NumOps = 20;
753 if (IID == Intrinsic::exp)
754 ++NumOps;
755 else if (IID == Intrinsic::exp10)
756 NumOps += 3;
757
758 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
759 }
760
761 if (SLT == MVT::f32) {
762 unsigned NumFullRateOps = 0;
763 // v_exp_f32 (transcendental).
764 unsigned NumTransOps = 1;
765
766 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
767 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
768 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
769 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
770 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
771 } else {
772 if (IID == Intrinsic::exp) {
773 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
774 NumFullRateOps = 1;
775 } else if (IID == Intrinsic::exp10) {
776 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
777 NumFullRateOps = 3;
778 NumTransOps = 2;
779 }
780 // Denorm scaling adds setcc + select + fadd + select + fmul.
781 if (HasFP32Denormals)
782 NumFullRateOps += 5;
783 }
784
785 InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
786 NumTransOps * getTransInstrCost(CostKind);
787 return LT.first * NElts * Cost;
788 }
789
790 break;
791 }
792 case Intrinsic::log:
793 case Intrinsic::log2:
794 case Intrinsic::log10: {
795 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
796 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
797 unsigned NElts =
798 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
799
800 if (SLT == MVT::f32) {
801 unsigned NumFullRateOps = 0;
802
803 if (IID == Intrinsic::log2) {
804 // LowerFLOG2: just v_log_f32.
805 } else if (ICA.getFlags().approxFunc()) {
806 // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
807 NumFullRateOps = 1;
808 } else {
809 // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
810 // multiply + finite check.
811 NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
812 }
813
814 if (HasFP32Denormals)
815 NumFullRateOps += 5;
816
817      InstructionCost Cost =
818          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
819 return LT.first * NElts * Cost;
820 }
821
822 break;
823 }
824 case Intrinsic::sin:
825 case Intrinsic::cos: {
826 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
827 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
828 unsigned NElts =
829 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
830
831 if (SLT == MVT::f32) {
832 // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
833 unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;
834
835      InstructionCost Cost =
836          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
837 return LT.first * NElts * Cost;
838 }
839
840 break;
841 }
842 case Intrinsic::sqrt: {
843 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
844 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
845 unsigned NElts =
846 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
847
848 if (SLT == MVT::f32) {
849 unsigned NumFullRateOps = 0;
850
851 if (!ICA.getFlags().approxFunc()) {
852 // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
853 NumFullRateOps = HasFP32Denormals ? 17 : 16;
854 }
855
856      InstructionCost Cost =
857          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
858 return LT.first * NElts * Cost;
859 }
860
861 break;
862 }
863 default:
864 break;
865 }
866
867  if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
868    return BaseT::getIntrinsicInstrCost(ICA, CostKind);
869
870 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
871 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
872 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
873
874 if ((ST->hasVOP3PInsts() &&
875 (SLT == MVT::f16 || SLT == MVT::i16 ||
876 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
877 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
878 NElts = (NElts + 1) / 2;
879
880 // TODO: Get more refined intrinsic costs?
881 unsigned InstRate = getQuarterRateInstrCost(CostKind);
882
883 switch (ICA.getID()) {
884 case Intrinsic::fma:
885 case Intrinsic::fmuladd:
886 if (SLT == MVT::f64) {
887 InstRate = get64BitInstrCost(CostKind);
888 break;
889 }
890
891 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
892 InstRate = getFullRateInstrCost();
893 else {
894 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
895 : getQuarterRateInstrCost(CostKind);
896 }
897 break;
898 case Intrinsic::copysign:
899 return NElts * getFullRateInstrCost();
900 case Intrinsic::minimumnum:
901 case Intrinsic::maximumnum: {
902    // Instruction + 2 canonicalizes. For cases that need type promotion, the
903    // promotion takes the place of the canonicalize.
904 unsigned NumOps = 3;
905 if (const IntrinsicInst *II = ICA.getInst()) {
906 // Directly legal with ieee=0
907 // TODO: Not directly legal with strictfp
908      if (fpenvIEEEMode(*II) == KnownIEEEMode::Off)
909        NumOps = 1;
910 }
911
912 unsigned BaseRate =
913 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
914 InstRate = BaseRate * NumOps;
915 break;
916 }
917 case Intrinsic::canonicalize: {
918 InstRate =
919 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
920 break;
921 }
922 case Intrinsic::uadd_sat:
923 case Intrinsic::usub_sat:
924 case Intrinsic::sadd_sat:
925 case Intrinsic::ssub_sat: {
926 if (SLT == MVT::i16 || SLT == MVT::i32)
927 InstRate = getFullRateInstrCost();
928
929 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
930    if (any_of(ValidSatTys, [&LT](MVT M) { return M == LT.second; }))
931 NElts = 1;
932 break;
933 }
934 case Intrinsic::abs:
935 // Expansion takes 2 instructions for VALU
936 if (SLT == MVT::i16 || SLT == MVT::i32)
937 InstRate = 2 * getFullRateInstrCost();
938 break;
939 default:
940 break;
941 }
942
943 return LT.first * NElts * InstRate;
944}
945
946InstructionCost GCNTTIImpl::getCFInstrCost(unsigned Opcode,
947                                           TTI::TargetCostKind CostKind,
948                                           const Instruction *I) const {
949 assert((I == nullptr || I->getOpcode() == Opcode) &&
950 "Opcode should reflect passed instruction.");
951 const bool SCost =
952      (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency);
953  const int CBrCost = SCost ? 5 : 7;
954  switch (Opcode) {
955  case Instruction::Br: {
956    const auto *BI = dyn_cast_or_null<BranchInst>(I);
957    if (BI && BI->isUnconditional())
958      return SCost ? 1 : 4; // A branch takes about 4 slots on gfx900.
959    // Suppose a conditional branch takes 3 additional exec manipulation
960    // instructions on average.
961    return CBrCost;
962  }
963  case Instruction::Switch: {
964    const auto *SI = dyn_cast_or_null<SwitchInst>(I);
965    // Each case (including default) takes 1 cmp + 1 cbr on average.
966    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
967  }
968  case Instruction::Ret:
969    return SCost ? 1 : 10;
970  }
971 return BaseT::getCFInstrCost(Opcode, CostKind, I);
972}
973
974InstructionCost
975GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
976                                       std::optional<FastMathFlags> FMF,
977                                       TTI::TargetCostKind CostKind) const {
978  if (TTI::requiresOrderedReduction(FMF))
979    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
980
981 EVT OrigTy = TLI->getValueType(DL, Ty);
982
983  // Computes cost on targets that have packed math instructions (which
984  // support 16-bit types only).
985 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
986 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
987
988 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
989 return LT.first * getFullRateInstrCost();
990}
991
992InstructionCost
993GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
994                                   FastMathFlags FMF,
995                                   TTI::TargetCostKind CostKind) const {
996 EVT OrigTy = TLI->getValueType(DL, Ty);
997
998  // Computes cost on targets that have packed math instructions (which
999  // support 16-bit types only).
1000 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1001 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1002
1003 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1004 return LT.first * getHalfRateInstrCost(CostKind);
1005}
1006
1007InstructionCost GCNTTIImpl::getVectorInstrCost(
1008    unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
1009 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
1010 switch (Opcode) {
1011 case Instruction::ExtractElement:
1012 case Instruction::InsertElement: {
1013 unsigned EltSize
1014 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1015 // Dynamic indexing isn't free and is best avoided.
1016 if (Index == ~0u)
1017 return 2;
1018 if (EltSize < 32) {
1019 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
1020 return 0;
1021 // Some i8 inserts and extracts are free so we want to reduce the
1022 // cost to avoid scalarization. We limit the zero cost cases to avoid
1023 // adversely impacting all i8 vectorizing.
1024 if (EltSize == 8) {
1025 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1026 if (NumElts >= 4 && isPowerOf2_32(NumElts)) {
1027 // Extracts at indices aligned to 32-bit boundaries (0, 4, 8, 12 for
1028 // v16i8) are free as they access the low byte of each VGPR. Other
1029 // indices require bit manipulation (shifts/byte selects) and cost 1.
1030 return Index % 4 == 0 ? 0 : 1;
1031 }
1032 }
1033 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1034 VIC);
1035 }
1036
1037 // Extracts are just reads of a subregister, so are free. Inserts are
1038 // considered free because we don't want to have any cost for scalarizing
1039 // operations, and we don't have to copy into a different register class.
1040 return 0;
1041 }
1042 default:
1043 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1044 VIC);
1045 }
1046}
1047
1048/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
1049/// this is analyzing the collective result of all output registers. Otherwise,
1050/// this is only querying a specific result index if this returns multiple
1051/// registers in a struct.
1052bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
1053    const CallInst *CI, ArrayRef<unsigned> Indices) const {
1054 // TODO: Handle complex extract indices
1055 if (Indices.size() > 1)
1056 return true;
1057
1058 const DataLayout &DL = CI->getDataLayout();
1059 const SIRegisterInfo *TRI = ST->getRegisterInfo();
1060 TargetLowering::AsmOperandInfoVector TargetConstraints =
1061 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
1062
1063 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
1064
1065 int OutputIdx = 0;
1066 for (auto &TC : TargetConstraints) {
1067 if (TC.Type != InlineAsm::isOutput)
1068 continue;
1069
1070 // Skip outputs we don't care about.
1071 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
1072 continue;
1073
1074 TLI->ComputeConstraintToUse(TC, SDValue());
1075
1076 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
1077 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
1078
1079 // For AGPR constraints null is returned on subtargets without AGPRs, so
1080 // assume divergent for null.
1081 if (!RC || !TRI->isSGPRClass(RC))
1082 return true;
1083 }
1084
1085 return false;
1086}
1087
1088bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
1089    const IntrinsicInst *ReadReg) const {
1090 Metadata *MD =
1091 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
1092  StringRef RegName =
1093      cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1094
1095 // Special case registers that look like VCC.
1096 MVT VT = MVT::getVT(ReadReg->getType());
1097 if (VT == MVT::i1)
1098 return true;
1099
1100 // Special case scalar registers that start with 'v'.
1101 if (RegName.starts_with("vcc") || RegName.empty())
1102 return false;
1103
1104 // VGPR or AGPR is divergent. There aren't any specially named vector
1105 // registers.
1106 return RegName[0] == 'v' || RegName[0] == 'a';
1107}
1108
1109/// \returns true if the result of the value could potentially be
1110/// different across workitems in a wavefront.
1111bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1112 if (const Argument *A = dyn_cast<Argument>(V))
1113    return !AMDGPU::isArgPassedInSGPR(A);
1114
1115 // Loads from the private and flat address spaces are divergent, because
1116 // threads can execute the load instruction with the same inputs and get
1117 // different results.
1118 //
1119 // All other loads are not divergent, because if threads issue loads with the
1120 // same arguments, they will always get the same result.
1121 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1122 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1123 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1124
1125 // Atomics are divergent because they are executed sequentially: when an
1126 // atomic operation refers to the same address in each thread, then each
1127 // thread after the first sees the value written by the previous thread as
1128 // original value.
1129  if (isa<AtomicRMWInst, AtomicCmpXchgInst>(V))
1130    return true;
1131
1132  if (const auto *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
1133    Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1134 switch (IID) {
1135 case Intrinsic::read_register:
1136      return isReadRegisterSourceOfDivergence(Intrinsic);
1137    case Intrinsic::amdgcn_addrspacecast_nonnull: {
1138 unsigned SrcAS =
1139 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1140 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1141 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1142 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1143 ST->hasGloballyAddressableScratch();
1144 }
1145 case Intrinsic::amdgcn_workitem_id_y:
1146 case Intrinsic::amdgcn_workitem_id_z: {
1147 const Function *F = Intrinsic->getFunction();
1148 bool HasUniformYZ =
1149          ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
1150 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1151 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1152 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1153 }
1154 default:
1155      return AMDGPU::isIntrinsicSourceOfDivergence(IID);
1156    }
1157 }
1158
1159 // Assume all function calls are a source of divergence.
1160 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1161 if (CI->isInlineAsm())
1162      return isInlineAsmSourceOfDivergence(CI);
1163    return true;
1164 }
1165
1166 // Assume all function calls are a source of divergence.
1167 if (isa<InvokeInst>(V))
1168 return true;
1169
1170  // If the target supports globally addressable scratch, the mapping from
1171  // scratch memory to the flat aperture changes; therefore an address space
1172  // cast is no longer uniform.
1173 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1174 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1175 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1176 ST->hasGloballyAddressableScratch();
1177 }
1178
1179 return false;
1180}
1181
1182bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1183 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1184 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1185
1186 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1187 if (CI->isInlineAsm())
1188      return !isInlineAsmSourceOfDivergence(CI);
1189    return false;
1190 }
1191
1192 // In most cases TID / wavefrontsize is uniform.
1193 //
1194  // However, if a kernel has uneven dimensions, the value of workitem-id-x
1195  // divided by the wavefrontsize can be non-uniform. For example, dimensions
1196  // (65, 2) will have workitems with addresses (64, 0) and (0, 1) packed into
1197  // the same wave, which gives 1 and 0 after the division by 64,
1198  // respectively.
1199 //
1200 // The X dimension doesn't reset within a wave if either both the Y
1201 // and Z dimensions are of length 1, or if the X dimension's required
1202 // size is a power of 2. Note, however, if the X dimension's maximum
1203 // size is a power of 2 < the wavefront size, division by the wavefront
1204 // size is guaranteed to yield 0, so this is also a no-reset case.
1205 bool XDimDoesntResetWithinWaves = false;
1206 if (auto *I = dyn_cast<Instruction>(V)) {
1207 const Function *F = I->getFunction();
1208 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1209 }
1210 using namespace llvm::PatternMatch;
1211 uint64_t C;
1212  if (match(V, m_LShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1213                      m_ConstantInt(C))) ||
1214      match(V, m_AShr(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1215                      m_ConstantInt(C)))) {
1216 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1217 }
1218
1219 Value *Mask;
1220  if (match(V, m_c_And(m_Intrinsic<Intrinsic::amdgcn_workitem_id_x>(),
1221                       m_Value(Mask)))) {
1222 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1223 ST->getWavefrontSizeLog2() &&
1224 XDimDoesntResetWithinWaves;
1225 }
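  // e.g. on a wave64 target, workitem.id.x & -64 (six or more trailing zero
  // mask bits) is treated as uniform, provided the X dimension does not reset
  // within a wave.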
1226
1227 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1228 if (!ExtValue)
1229 return false;
1230
1231 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1232 if (!CI)
1233 return false;
1234
1235 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1236 switch (Intrinsic->getIntrinsicID()) {
1237 default:
1238 return false;
1239 case Intrinsic::amdgcn_if:
1240 case Intrinsic::amdgcn_else: {
1241 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1242 return Indices.size() == 1 && Indices[0] == 1;
1243 }
1244 }
1245 }
1246
1247 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1248 // divergent for the overall struct return. We need to override it in the
1249 // case we're extracting an SGPR component here.
1250 if (CI->isInlineAsm())
1251 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1252
1253 return false;
1254}
1255
1256bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
1257                                            Intrinsic::ID IID) const {
1258 switch (IID) {
1259 case Intrinsic::amdgcn_is_shared:
1260 case Intrinsic::amdgcn_is_private:
1261 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1262 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1263 case Intrinsic::amdgcn_load_to_lds:
1264 case Intrinsic::amdgcn_make_buffer_rsrc:
1265 OpIndexes.push_back(0);
1266 return true;
1267 default:
1268 return false;
1269 }
1270}
1271
1272Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
1273                                                    Value *OldV,
1274 Value *NewV) const {
1275 auto IntrID = II->getIntrinsicID();
1276 switch (IntrID) {
1277 case Intrinsic::amdgcn_is_shared:
1278 case Intrinsic::amdgcn_is_private: {
1279 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1280        AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
1281    unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1282 LLVMContext &Ctx = NewV->getType()->getContext();
1283 ConstantInt *NewVal = (TrueAS == NewAS) ?
1284        ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
1285    return NewVal;
1286 }
1287 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1288 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1289 Type *DestTy = II->getType();
1290 Type *SrcTy = NewV->getType();
1291 unsigned NewAS = SrcTy->getPointerAddressSpace();
1292    if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
1293      return nullptr;
1294 Module *M = II->getModule();
1295    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1296        M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1297 II->setArgOperand(0, NewV);
1298 II->setCalledFunction(NewDecl);
1299 return II;
1300 }
1301 case Intrinsic::amdgcn_load_to_lds: {
1302 Type *SrcTy = NewV->getType();
1303 Module *M = II->getModule();
1304 Function *NewDecl =
1305 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1306 II->setArgOperand(0, NewV);
1307 II->setCalledFunction(NewDecl);
1308 return II;
1309 }
1310 case Intrinsic::amdgcn_make_buffer_rsrc: {
1311 Type *SrcTy = NewV->getType();
1312 Type *DstTy = II->getType();
1313 Module *M = II->getModule();
1314    Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1315        M, II->getIntrinsicID(), {DstTy, SrcTy});
1316 II->setArgOperand(0, NewV);
1317 II->setCalledFunction(NewDecl);
1318 return II;
1319 }
1320 default:
1321 return nullptr;
1322 }
1323}
1324
1325InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
1326                                           VectorType *DstTy, VectorType *SrcTy,
1327 ArrayRef<int> Mask,
1328                                           TTI::TargetCostKind CostKind,
1329                                           int Index, VectorType *SubTp,
1330                                           ArrayRef<const Value *> Args,
1331                                           const Instruction *CxtI) const {
1332 if (!isa<FixedVectorType>(SrcTy))
1333 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1334 SubTp);
1335
1336 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1337
1338 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1339 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1340 (ScalarSize == 16 || ScalarSize == 8)) {
1341 // Larger vector widths may require additional instructions, but are
1342 // typically cheaper than scalarized versions.
1343 //
1344 // We assume that shuffling at a register granularity can be done for free.
1345 // This is not true for vectors fed into memory instructions, but it is
1346 // effectively true for all other shuffling. The emphasis of the logic here
1347 // is to assist generic transform in cleaning up / canonicalizing those
1348 // shuffles.
1349
1350 // With op_sel VOP3P instructions freely can access the low half or high
1351 // half of a register, so any swizzle of two elements is free.
1352 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1353 unsigned NumSrcElts = SrcVecTy->getNumElements();
1354 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1355 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1356 Kind == TTI::SK_PermuteSingleSrc))
1357 return 0;
1358 }
1359
1360 unsigned EltsPerReg = 32 / ScalarSize;
1361 switch (Kind) {
1362 case TTI::SK_Broadcast:
1363 // A single v_perm_b32 can be re-used for all destination registers.
1364 return 1;
1365 case TTI::SK_Reverse:
1366 // One instruction per register.
1367 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1368        return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1369      return 1;
1370    case TTI::SK_ExtractSubvector:
1371      if (Index % EltsPerReg == 0)
1372 return 0; // Shuffling at register granularity
1373 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1374        return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1375      return 1;
1376    case TTI::SK_InsertSubvector: {
1377      auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1378 if (!DstVecTy)
1380 unsigned NumDstElts = DstVecTy->getNumElements();
1381 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1382 unsigned EndIndex = Index + NumInsertElts;
1383 unsigned BeginSubIdx = Index % EltsPerReg;
1384 unsigned EndSubIdx = EndIndex % EltsPerReg;
1385 unsigned Cost = 0;
1386
1387 if (BeginSubIdx != 0) {
1388 // Need to shift the inserted vector into place. The cost is the number
1389 // of destination registers overlapped by the inserted vector.
1390 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1391 }
1392
1393 // If the last register overlap is partial, there may be three source
1394 // registers feeding into it; that takes an extra instruction.
1395 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1396 Cost += 1;
1397
1398 return Cost;
1399 }
1400 case TTI::SK_Splice: {
1401 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1402 if (!DstVecTy)
1404 unsigned NumElts = DstVecTy->getNumElements();
1405 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1406 // Determine the sub-region of the result vector that requires
1407 // sub-register shuffles / mixing.
1408 unsigned EltsFromLHS = NumElts - Index;
1409 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1410 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1411 if (LHSIsAligned && RHSIsAligned)
1412 return 0;
1413 if (LHSIsAligned && !RHSIsAligned)
1414 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1415 if (!LHSIsAligned && RHSIsAligned)
1416 return divideCeil(EltsFromLHS, EltsPerReg);
1417 return divideCeil(NumElts, EltsPerReg);
1418 }
1419 default:
1420 break;
1421 }
1422
1423 if (!Mask.empty()) {
1424 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1425
1426 // Generically estimate the cost by assuming that each destination
1427 // register is derived from sources via v_perm_b32 instructions if it
1428 // can't be copied as-is.
1429 //
1430 // For each destination register, derive the cost of obtaining it based
1431 // on the number of source registers that feed into it.
1432 unsigned Cost = 0;
1433 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1434        SmallVector<int, 4> Regs;
1435        bool Aligned = true;
1436 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1437 int SrcIdx = Mask[DstIdx + I];
1438 if (SrcIdx == -1)
1439 continue;
1440 int Reg;
1441 if (SrcIdx < (int)NumSrcElts) {
1442 Reg = SrcIdx / EltsPerReg;
1443 if (SrcIdx % EltsPerReg != I)
1444 Aligned = false;
1445 } else {
1446 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1447 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1448 Aligned = false;
1449 }
1450 if (!llvm::is_contained(Regs, Reg))
1451 Regs.push_back(Reg);
1452 }
1453 if (Regs.size() >= 2)
1454 Cost += Regs.size() - 1;
1455 else if (!Aligned)
1456 Cost += 1;
1457 }
1458 return Cost;
1459 }
1460 }
1461
1462 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1463 SubTp);
1464}
1465
1466/// Whether it is profitable to sink the operands of an
1467/// Instruction I to the basic block of I.
1468/// This helps using several modifiers (like abs and neg) more often.
1469bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,
1470                                            SmallVectorImpl<Use *> &Ops) const {
1471 using namespace PatternMatch;
1472
1473 for (auto &Op : I->operands()) {
1474 // Ensure we are not already sinking this operand.
1475 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1476 continue;
1477
1478 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1479 Ops.push_back(&Op);
1480 continue;
1481 }
1482
1483 // Check for zero-cost multiple use InsertElement/ExtractElement
1484 // instructions
1485 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1486 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1487 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1488 if (VecOpInst && VecOpInst->hasOneUse())
1489 continue;
1490
1491 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1493 OpInst->getOperand(0),
1494 OpInst->getOperand(1)) == 0) {
1495 Ops.push_back(&Op);
1496 continue;
1497 }
1498 }
1499 }
1500
1501 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1502
1503 unsigned EltSize = DL.getTypeSizeInBits(
1504 cast<VectorType>(Shuffle->getType())->getElementType());
1505
1506 // For i32 (or greater) shufflevectors, these will be lowered into a
1507 // series of insert / extract elements, which will be coalesced away.
1508 if (EltSize < 16 || !ST->has16BitInsts())
1509 continue;
1510
1511 int NumSubElts, SubIndex;
1512 if (Shuffle->changesLength()) {
1513 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1514 Ops.push_back(&Op);
1515 continue;
1516 }
1517
1518 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1519 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1520 !(SubIndex & 0x1)) {
1521 Ops.push_back(&Op);
1522 continue;
1523 }
1524 }
1525
1526 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1527 Shuffle->isSingleSource()) {
1528 Ops.push_back(&Op);
1529 continue;
1530 }
1531 }
1532 }
1533
1534 return !Ops.empty();
1535}
1536
1537bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
1538                                     const Function *Callee) const {
1539 const TargetMachine &TM = getTLI()->getTargetMachine();
1540 const GCNSubtarget *CallerST
1541 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1542 const GCNSubtarget *CalleeST
1543 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1544
1545 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1546 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1547
1548 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1549 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1550 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1551 return false;
1552
1553 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1554 // no way to support merge for backend defined attributes.
1555 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1556 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1557 if (!CallerMode.isInlineCompatible(CalleeMode))
1558 return false;
1559
1560 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1561 Callee->hasFnAttribute(Attribute::InlineHint))
1562 return true;
1563
1564 // Hack to make compile times reasonable.
1565 if (InlineMaxBB) {
1566 // Single BB does not increase total BB amount.
1567 if (Callee->size() == 1)
1568 return true;
1569 size_t BBSize = Caller->size() + Callee->size() - 1;
1570 return BBSize <= InlineMaxBB;
1571 }
1572
1573 return true;
1574}
1575
1576static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
1577                                                   const SITargetLowering *TLI,
1578 const GCNTTIImpl *TTIImpl) {
1579 const int NrOfSGPRUntilSpill = 26;
1580 const int NrOfVGPRUntilSpill = 32;
1581
1582 const DataLayout &DL = TTIImpl->getDataLayout();
1583
1584 unsigned adjustThreshold = 0;
1585 int SGPRsInUse = 0;
1586 int VGPRsInUse = 0;
1587 for (const Use &A : CB->args()) {
1588 SmallVector<EVT, 4> ValueVTs;
1589 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1590 for (auto ArgVT : ValueVTs) {
1591 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1592 CB->getContext(), CB->getCallingConv(), ArgVT);
1593      if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
1594        SGPRsInUse += CCRegNum;
1595 else
1596 VGPRsInUse += CCRegNum;
1597 }
1598 }
1599
1600  // The cost of passing function arguments through the stack:
1601  // 1 instruction to put a function argument on the stack in the caller.
1602  // 1 instruction to take a function argument from the stack in the callee.
1603  // 1 instruction to explicitly take care of data dependencies in the callee
1604  // function.
1605 InstructionCost ArgStackCost(1);
1606 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1607      Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1608      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1609 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1610      Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1611      AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
1612
1613 // The penalty cost is computed relative to the cost of instructions and does
1614 // not model any storage costs.
1615 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1616 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1617 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1618 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1619 return adjustThreshold;
1620}
1621
1622static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1623 const DataLayout &DL) {
1624 // If we have a pointer to a private array passed into a function
1625 // it will not be optimized out, leaving scratch usage.
1626 // This function calculates the total size in bytes of the memory that would
1627 // end in scratch if the call was not inlined.
1628 unsigned AllocaSize = 0;
1629  SmallPtrSet<const AllocaInst *, 8> AIVisited;
1630  for (Value *PtrArg : CB->args()) {
1631 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1632 if (!Ty)
1633 continue;
1634
1635 unsigned AddrSpace = Ty->getAddressSpace();
1636 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1637 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1638 continue;
1639
1640    const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
1641    if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1642 continue;
1643
1644 if (auto Size = AI->getAllocationSize(DL))
1645 AllocaSize += Size->getFixedValue();
1646 }
1647 return AllocaSize;
1648}
1649
1654
1655unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
1656  unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1657
1658 // Private object passed as arguments may end up in scratch usage if the call
1659 // is not inlined. Increase the inline threshold to promote inlining.
1660 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1661 if (AllocaSize > 0)
1662 Threshold += ArgAllocaCost;
1663 return Threshold;
1664}
1665
1666unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
1667                                         const AllocaInst *AI) const {
1668
1669 // Below the cutoff, assume that the private memory objects would be
1670 // optimized
1671 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1672 if (AllocaSize <= ArgAllocaCutoff)
1673 return 0;
1674
1675 // Above the cutoff, we give a cost to each private memory object
1676  // depending on its size. If the array can be optimized by SROA this cost
1677  // is not added to the total-cost in the inliner cost analysis.
1678 //
1679 // We choose the total cost of the alloca such that their sum cancels the
1680 // bonus given in the threshold (ArgAllocaCost).
1681 //
1682 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1683 //
1684 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1685 // the single-bb bonus and the vector-bonus.
1686 //
1687 // We compensate the first two multipliers, by repeating logic from the
1688 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1689 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1690 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1691
1692 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1693 return BB.getTerminator()->getNumSuccessors() > 1;
1694 });
1695 if (SingleBB) {
1696 Threshold += Threshold / 2;
1697 }
1698
1699 auto ArgAllocaSize = AI->getAllocationSize(DL);
1700 if (!ArgAllocaSize)
1701 return 0;
1702
1703 // Attribute the bonus proportionally to the alloca size
1704 unsigned AllocaThresholdBonus =
1705 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1706
1707 return AllocaThresholdBonus;
1708}
1709
1710void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
1711                                         TTI::UnrollingPreferences &UP,
1712                                         OptimizationRemarkEmitter *ORE) const {
1713 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1714}
1715
1716void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
1717                                       TTI::PeelingPreferences &PP) const {
1718 CommonTTI.getPeelingPreferences(L, SE, PP);
1719}
1720
1721int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
1722 return getQuarterRateInstrCost(CostKind);
1723}
1724
1725int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1726 return ST->hasFullRate64Ops()
1727 ? getFullRateInstrCost()
1728 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1729 : getQuarterRateInstrCost(CostKind);
1730}
1731
1732std::pair<InstructionCost, MVT>
1733GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1734 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1735 auto Size = DL.getTypeSizeInBits(Ty);
1736 // The maximum load or store can handle 8 dwords for the scalar unit and 4
1737 // for the vector ALU. Assume anything above 8 dwords is expensive even if
1738 // legal.
1739 if (Size <= 256)
1740 return Cost;
1741
1742 Cost.first += (Size + 255) / 256;
1743 return Cost;
1744}
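// Illustrative sketch (not in the source): a <32 x i32> value is 1024 bits, so
// in addition to the base legalization cost this adds (1024 + 255) / 256 = 4
// to Cost.first, modelling that accesses wider than 8 dwords are expensive
// even when the type is legal after splitting.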
1745
1746 unsigned GCNTTIImpl::getPrefetchDistance() const {
1747 return ST->hasPrefetch() ? 128 : 0;
1748}
1749
1750 bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
1751 return !AMDGPU::isFlatGlobalAddrSpace(AS);
1752}
1753
1754 void GCNTTIImpl::collectKernelLaunchBounds(
1755 const Function &F,
1756 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1757 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1758 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1759 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1760 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1761 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1762 ST->getFlatWorkGroupSizes(F);
1763 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1764 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1765 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1766 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1767 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1768}
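// Illustrative sketch (hypothetical kernel attributes): a kernel carrying
// "amdgpu-flat-work-group-size"="1,256" would contribute the pairs
// {"amdgpu-flat-work-group-size[0]", 1} and
// {"amdgpu-flat-work-group-size[1]", 256} to LB, alongside the three
// per-dimension "amdgpu-max-num-workgroups" entries and the two
// "amdgpu-waves-per-eu" bounds queried from the subtarget.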
1769
1770 GCNTTIImpl::KnownIEEEMode
1771 GCNTTIImpl::fpenvIEEEMode(const Instruction &I) const {
1772 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1773 return KnownIEEEMode::On; // Only mode on gfx1170+
1774
1775 const Function *F = I.getFunction();
1776 if (!F)
1777 return KnownIEEEMode::Unknown;
1778
1779 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1780 if (IEEEAttr.isValid())
1781 return IEEEAttr.getValueAsBool() ? KnownIEEEMode::On : KnownIEEEMode::Off;
1782
1783 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1784 : KnownIEEEMode::On;
1785}
1786
1787 InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
1788 Align Alignment,
1789 unsigned AddressSpace,
1790 TTI::TargetCostKind CostKind,
1791 TTI::OperandValueInfo OpInfo,
1792 const Instruction *I) const {
1793 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1794 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1795 VecTy->getElementType()->isIntegerTy(8)) {
1796 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1797 getLoadStoreVecRegBitWidth(AddressSpace));
1798 }
1799 }
1800 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1801 OpInfo, I);
1802}
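// Illustrative sketch (assumes a 128-bit load/store vector register width for
// the queried address space): a load of <32 x i8> is 256 bits, so the i8
// special case above returns divideCeil(256 - 1, 128) = 2 rather than a
// per-element cost, reflecting that byte vectors are moved as packed 32-bit
// groups.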
1803
1804 unsigned GCNTTIImpl::getNumberOfParts(Type *Tp) const {
1805 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1806 if (VecTy->getElementType()->isIntegerTy(8)) {
1807 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1808 return divideCeil(ElementCount - 1, 4);
1809 }
1810 }
1811 return BaseT::getNumberOfParts(Tp);
1812}
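// Illustrative sketch (not in the source): a <16 x i8> vector reports
// divideCeil(16 - 1, 4) = 4 parts, i.e. sixteen bytes are modelled as four
// packed i32 values rather than sixteen independent scalars.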
1813
1814 ValueUniformity GCNTTIImpl::getValueUniformity(const Value *V) const {
1815 if (const auto *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
1816 switch (Intrinsic->getIntrinsicID()) {
1817 case Intrinsic::amdgcn_wave_shuffle:
1818 return ValueUniformity::Custom;
1819 default:
1820 break;
1821 }
1822 }
1823
1824 if (isAlwaysUniform(V))
1825 return ValueUniformity::AlwaysUniform;
1826
1827 if (isSourceOfDivergence(V))
1828 return ValueUniformity::NeverUniform;
1829
1830 return ValueUniformity::Default;
1831}
1832
1833 InstructionCost GCNTTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
1834 StackOffset BaseOffset,
1835 bool HasBaseReg, int64_t Scale,
1836 unsigned AddrSpace) const {
1837 if (HasBaseReg && Scale != 0) {
1838 // gfx1250+ can fold base+scale*index when scale matches the memory access
1839 // size (scale_offset bit). Supported for flat/global/constant/scratch
1840 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1841 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1842 (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace) ||
1843 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1844 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
1845 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1846 if (TypeSize::isKnownLE(StoreSize, TypeSize::getFixed(16)) &&
1847 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1848 return 0;
1849 }
1850 return 1;
1851 }
1852 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1853 AddrSpace);
1854}
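// Illustrative sketch (hypothetical gfx1250 case): a global i32 load addressed
// as base + 4 * index has StoreSize == Scale == 4, so the scaling is free
// (cost 0) because the scale_offset bit folds the shift into the memory
// instruction; any other non-zero scale costs one extra instruction for the
// address computation.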
1855
1856 bool GCNTTIImpl::isLSRCostLess(const TTI::LSRCost &A,
1857 const TTI::LSRCost &B) const {
1858 // Favor lower per-iteration work over preheader/setup costs.
1859 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1860 // effective instruction count (base+scale*index requires a separate ADD).
1861 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1862 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1863
1864 return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
1865 A.SetupCost, A.ImmCost, A.NumRegs) <
1866 std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
1867 B.SetupCost, B.ImmCost, B.NumRegs);
1868}
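// Illustrative sketch (hypothetical LSR candidates): a formula with Insns = 4
// and ScaleCost = 1 compares as 5 effective instructions and loses to one with
// Insns = 4 and ScaleCost = 0, even if the winner has a higher SetupCost,
// since preheader setup work is only consulted as a later tie-breaker.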
1869
1870 bool GCNTTIImpl::isNumRegsMajorCostOfLSR() const {
1871 // isLSRCostLess de-prioritizes register count; keep consistent.
1872 return false;
1873}
1874
1875 bool GCNTTIImpl::shouldDropLSRSolutionIfLessProfitable() const {
1876 // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
1877 return true;
1878}
1879
1880 bool GCNTTIImpl::isUniform(const Instruction *I,
1881 const SmallBitVector &UniformArgs) const {
1882 const auto *Intrinsic = cast<IntrinsicInst>(I);
1883 switch (Intrinsic->getIntrinsicID()) {
1884 case Intrinsic::amdgcn_wave_shuffle:
1885 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1886 // is uniform.
1887 return UniformArgs[0] || UniformArgs[1];
1888 default:
1889 llvm_unreachable("unexpected intrinsic in isUniform");
1890 }
1891}
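// Illustrative sketch (schematic IR, not from the source): for
//   %v = call i32 @llvm.amdgcn.wave.shuffle(i32 %val, i32 %idx)
// the result is uniform when %val is uniform (every lane reads the same value
// regardless of the lane index) or when %idx is uniform (every lane reads from
// the same source lane), which is exactly the UniformArgs[0] || UniformArgs[1]
// check above.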