LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/Function.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
30#include <optional>
31
32using namespace llvm;
33
34#define DEBUG_TYPE "AMDGPUtti"
35
37 "amdgpu-unroll-threshold-private",
38 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
39 cl::init(2700), cl::Hidden);
40
42 "amdgpu-unroll-threshold-local",
43 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
44 cl::init(1000), cl::Hidden);
45
47 "amdgpu-unroll-threshold-if",
48 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
49 cl::init(200), cl::Hidden);
50
52 "amdgpu-unroll-runtime-local",
53 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
54 cl::init(true), cl::Hidden);
55
57 "amdgpu-unroll-max-block-to-analyze",
58 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
59 cl::init(32), cl::Hidden);
60
61static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
62 cl::Hidden, cl::init(4000),
63 cl::desc("Cost of alloca argument"));
64
65// If the amount of scratch memory to eliminate exceeds our ability to allocate
66// it into registers we gain nothing by aggressively inlining functions for that
67// heuristic.
69 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
70 cl::init(256),
71 cl::desc("Maximum alloca size to use for inline cost"));
72
73// Inliner constraint to achieve reasonable compilation time.
75 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
76 cl::desc("Maximum number of BBs allowed in a function after inlining"
77 " (compile time constraint)"));
78
79// This default unroll factor is based on microbenchmarks on gfx1030.
81 "amdgpu-memcpy-loop-unroll",
82 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
83 "operations when lowering statically-sized memcpy, memmove, or"
84 "memset as a loop"),
85 cl::init(16), cl::Hidden);
86
87static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
88 unsigned Depth = 0) {
90 if (!I)
91 return false;
92
93 for (const Value *V : I->operand_values()) {
94 if (!L->contains(I))
95 continue;
96 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
97 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
98 return SubLoop->contains(PHI); }))
99 return true;
100 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
101 return true;
102 }
103 return false;
104}
105
107 : BaseT(TM, F.getDataLayout()),
108 TargetTriple(TM->getTargetTriple()),
109 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
110 TLI(ST->getTargetLowering()) {}
111
114 OptimizationRemarkEmitter *ORE) const {
115 const Function &F = *L->getHeader()->getParent();
116 UP.Threshold =
117 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
118 UP.MaxCount = std::numeric_limits<unsigned>::max();
119 UP.Partial = true;
120
121 // Conditional branch in a loop back edge needs 3 additional exec
122 // manipulations in average.
123 UP.BEInsns += 3;
124
125 // We want to run unroll even for the loops which have been vectorized.
126 UP.UnrollVectorizedLoop = true;
127
128 // TODO: Do we want runtime unrolling?
129
130 // Maximum alloca size than can fit registers. Reserve 16 registers.
131 const unsigned MaxAlloca = (256 - 16) * 4;
132 unsigned ThresholdPrivate = UnrollThresholdPrivate;
133 unsigned ThresholdLocal = UnrollThresholdLocal;
134
135 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
136 // provided threshold value as the default for Threshold
137 if (MDNode *LoopUnrollThreshold =
138 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
139 if (LoopUnrollThreshold->getNumOperands() == 2) {
141 LoopUnrollThreshold->getOperand(1));
142 if (MetaThresholdValue) {
143 // We will also use the supplied value for PartialThreshold for now.
144 // We may introduce additional metadata if it becomes necessary in the
145 // future.
146 UP.Threshold = MetaThresholdValue->getSExtValue();
148 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
149 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
150 }
151 }
152 }
153
154 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
155 for (const BasicBlock *BB : L->getBlocks()) {
156 const DataLayout &DL = BB->getDataLayout();
157 unsigned LocalGEPsSeen = 0;
158
159 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
160 return SubLoop->contains(BB); }))
161 continue; // Block belongs to an inner loop.
162
163 for (const Instruction &I : *BB) {
164 // Unroll a loop which contains an "if" statement whose condition
165 // defined by a PHI belonging to the loop. This may help to eliminate
166 // if region and potentially even PHI itself, saving on both divergence
167 // and registers used for the PHI.
168 // Add a small bonus for each of such "if" statements.
169 if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
170 if (UP.Threshold < MaxBoost && Br->isConditional()) {
171 BasicBlock *Succ0 = Br->getSuccessor(0);
172 BasicBlock *Succ1 = Br->getSuccessor(1);
173 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
174 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
175 continue;
176 if (dependsOnLocalPhi(L, Br->getCondition())) {
178 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
179 << " for loop:\n"
180 << *L << " due to " << *Br << '\n');
181 if (UP.Threshold >= MaxBoost)
182 return;
183 }
184 }
185 continue;
186 }
187
189 if (!GEP)
190 continue;
191
192 unsigned AS = GEP->getAddressSpace();
193 unsigned Threshold = 0;
195 Threshold = ThresholdPrivate;
197 Threshold = ThresholdLocal;
198 else
199 continue;
200
201 if (UP.Threshold >= Threshold)
202 continue;
203
204 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
205 const Value *Ptr = GEP->getPointerOperand();
206 const AllocaInst *Alloca =
208 if (!Alloca || !Alloca->isStaticAlloca())
209 continue;
210 auto AllocaSize = Alloca->getAllocationSize(DL);
211 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
212 continue;
213 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
215 LocalGEPsSeen++;
216 // Inhibit unroll for local memory if we have seen addressing not to
217 // a variable, most likely we will be unable to combine it.
218 // Do not unroll too deep inner loops for local memory to give a chance
219 // to unroll an outer loop for a more important reason.
220 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2)
221 continue;
222
223 const Value *V = getUnderlyingObject(GEP->getPointerOperand());
224 if (!isa<GlobalVariable>(V) && !isa<Argument>(V))
225 continue;
226
227 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
228 << *L << " due to LDS use.\n");
230 }
231
232 // Check if GEP depends on a value defined by this loop itself.
233 bool HasLoopDef = false;
234 for (const Value *Op : GEP->operands()) {
235 const Instruction *Inst = dyn_cast<Instruction>(Op);
236 if (!Inst || L->isLoopInvariant(Op))
237 continue;
238
239 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
240 return SubLoop->contains(Inst); }))
241 continue;
242 HasLoopDef = true;
243 break;
244 }
245 if (!HasLoopDef)
246 continue;
247
248 // We want to do whatever we can to limit the number of alloca
249 // instructions that make it through to the code generator. allocas
250 // require us to use indirect addressing, which is slow and prone to
251 // compiler bugs. If this loop does an address calculation on an
252 // alloca ptr, then we want to use a higher than normal loop unroll
253 // threshold. This will give SROA a better chance to eliminate these
254 // allocas.
255 //
256 // We also want to have more unrolling for local memory to let ds
257 // instructions with different offsets combine.
258 //
259 // Don't use the maximum allowed value here as it will make some
260 // programs way too big.
261 UP.Threshold = Threshold;
262 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
263 << " for loop:\n"
264 << *L << " due to " << *GEP << '\n');
265 if (UP.Threshold >= MaxBoost)
266 return;
267 }
268
269 // If we got a GEP in a small BB from inner loop then increase max trip
270 // count to analyze for better estimation cost in unroll
271 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
273 }
274}
275
280
284
285const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
286 // Codegen control options which don't matter.
287 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
288 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
289 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
290
291 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
292
293 // Property of the kernel/environment which can't actually differ.
294 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
295 AMDGPU::FeatureTrapHandler,
296
297 // The default assumption needs to be ecc is enabled, but no directly
298 // exposed operations depend on it, so it can be safely inlined.
299 AMDGPU::FeatureSRAMECC,
300
301 // Perf-tuning features
302 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
303
305 : BaseT(TM, F.getDataLayout()),
306 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
307 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
308 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
310 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
311 HasFP64FP16Denormals =
312 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
313}
314
316 return !F || !ST->isSingleLaneExecution(*F);
317}
318
319unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
320 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
321 // registers. See getRegisterClassForType for the implementation.
322 // In this case vector registers are not vector in terms of
323 // VGPRs, but those which can hold multiple values.
324
325 // This is really the number of registers to fill when vectorizing /
326 // interleaving loops, so we lie to avoid trying to use all registers.
327 return 4;
328}
329
332 switch (K) {
334 return TypeSize::getFixed(32);
336 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
338 return TypeSize::getScalable(0);
339 }
340 llvm_unreachable("Unsupported register kind");
341}
342
344 return 32;
345}
346
347unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
348 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
349 return 32 * 4 / ElemWidth;
350 // For a given width return the max 0number of elements that can be combined
351 // into a wider bit value:
352 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
353 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
354 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
355 : 1;
356}
357
358unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
359 unsigned ChainSizeInBytes,
360 VectorType *VecTy) const {
361 unsigned VecRegBitWidth = VF * LoadSize;
362 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
363 // TODO: Support element-size less than 32bit?
364 return 128 / LoadSize;
365
366 return VF;
367}
368
369unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
370 unsigned ChainSizeInBytes,
371 VectorType *VecTy) const {
372 unsigned VecRegBitWidth = VF * StoreSize;
373 if (VecRegBitWidth > 128)
374 return 128 / StoreSize;
375
376 return VF;
377}
378
379unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
380 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
381 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
383 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
384 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
386 return 512;
387 }
388
389 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
390 return 8 * ST->getMaxPrivateElementSize();
391
392 // Common to flat, global, local and region. Assume for unknown addrspace.
393 return 128;
394}
395
396bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
397 Align Alignment,
398 unsigned AddrSpace) const {
399 // We allow vectorization of flat stores, even though we may need to decompose
400 // them later if they may access private memory. We don't have enough context
401 // here, and legalization can handle it.
402 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
403 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
404 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
405 }
406 return true;
407}
408
409bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
410 Align Alignment,
411 unsigned AddrSpace) const {
412 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
413}
414
415bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
416 Align Alignment,
417 unsigned AddrSpace) const {
418 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
419}
420
424
426 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
427 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
428 std::optional<uint32_t> AtomicElementSize) const {
429
430 if (AtomicElementSize)
431 return Type::getIntNTy(Context, *AtomicElementSize * 8);
432
433 // 16-byte accesses achieve the highest copy throughput.
434 // If the operation has a fixed known length that is large enough, it is
435 // worthwhile to return an even wider type and let legalization lower it into
436 // multiple accesses, effectively unrolling the memcpy loop.
437 // We also rely on legalization to decompose into smaller accesses for
438 // subtargets and address spaces where it is necessary.
439 //
440 // Don't unroll if Length is not a constant, since unrolling leads to worse
441 // performance for length values that are smaller or slightly larger than the
442 // total size of the type returned here. Mitigating that would require a more
443 // complex lowering for variable-length memcpy and memmove.
444 unsigned I32EltsInVector = 4;
447 MemcpyLoopUnroll * I32EltsInVector);
448
449 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
450}
451
453 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
454 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
455 Align SrcAlign, Align DestAlign,
456 std::optional<uint32_t> AtomicCpySize) const {
457
458 if (AtomicCpySize)
460 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
461 DestAlign, AtomicCpySize);
462
463 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
464 while (RemainingBytes >= 16) {
465 OpsOut.push_back(I32x4Ty);
466 RemainingBytes -= 16;
467 }
468
469 Type *I64Ty = Type::getInt64Ty(Context);
470 while (RemainingBytes >= 8) {
471 OpsOut.push_back(I64Ty);
472 RemainingBytes -= 8;
473 }
474
475 Type *I32Ty = Type::getInt32Ty(Context);
476 while (RemainingBytes >= 4) {
477 OpsOut.push_back(I32Ty);
478 RemainingBytes -= 4;
479 }
480
481 Type *I16Ty = Type::getInt16Ty(Context);
482 while (RemainingBytes >= 2) {
483 OpsOut.push_back(I16Ty);
484 RemainingBytes -= 2;
485 }
486
487 Type *I8Ty = Type::getInt8Ty(Context);
488 while (RemainingBytes) {
489 OpsOut.push_back(I8Ty);
490 --RemainingBytes;
491 }
492}
493
495 // Disable unrolling if the loop is not vectorized.
496 // TODO: Enable this again.
497 if (VF.isScalar())
498 return 1;
499
500 return 8;
501}
502
504 MemIntrinsicInfo &Info) const {
505 switch (Inst->getIntrinsicID()) {
506 case Intrinsic::amdgcn_ds_ordered_add:
507 case Intrinsic::amdgcn_ds_ordered_swap: {
508 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
509 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
510 if (!Ordering || !Volatile)
511 return false; // Invalid.
512
513 unsigned OrderingVal = Ordering->getZExtValue();
514 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
515 return false;
516
517 Info.PtrVal = Inst->getArgOperand(0);
518 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
519 Info.ReadMem = true;
520 Info.WriteMem = true;
521 Info.IsVolatile = !Volatile->isZero();
522 return true;
523 }
524 default:
525 return false;
526 }
527}
528
530 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
532 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
533
534 // Legalize the type.
535 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
536 int ISD = TLI->InstructionOpcodeToISD(Opcode);
537
538 // Because we don't have any legal vector operations, but the legal types, we
539 // need to account for split vectors.
540 unsigned NElts = LT.second.isVector() ?
541 LT.second.getVectorNumElements() : 1;
542
543 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
544
545 switch (ISD) {
546 case ISD::SHL:
547 case ISD::SRL:
548 case ISD::SRA:
549 if (SLT == MVT::i64)
550 return get64BitInstrCost(CostKind) * LT.first * NElts;
551
552 if (ST->has16BitInsts() && SLT == MVT::i16)
553 NElts = (NElts + 1) / 2;
554
555 // i32
556 return getFullRateInstrCost() * LT.first * NElts;
557 case ISD::ADD:
558 case ISD::SUB:
559 case ISD::AND:
560 case ISD::OR:
561 case ISD::XOR:
562 if (SLT == MVT::i64) {
563 // and, or and xor are typically split into 2 VALU instructions.
564 return 2 * getFullRateInstrCost() * LT.first * NElts;
565 }
566
567 if (ST->has16BitInsts() && SLT == MVT::i16)
568 NElts = (NElts + 1) / 2;
569
570 return LT.first * NElts * getFullRateInstrCost();
571 case ISD::MUL: {
572 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
573 if (SLT == MVT::i64) {
574 const int FullRateCost = getFullRateInstrCost();
575 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
576 }
577
578 if (ST->has16BitInsts() && SLT == MVT::i16)
579 NElts = (NElts + 1) / 2;
580
581 // i32
582 return QuarterRateCost * NElts * LT.first;
583 }
584 case ISD::FMUL:
585 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
586 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
587 // fused operation.
588 if (CxtI && CxtI->hasOneUse())
589 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
590 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
591 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
592 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
594 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
596
597 // Estimate all types may be fused with contract/unsafe flags
598 const TargetOptions &Options = TLI->getTargetMachine().Options;
599 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
600 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
602 }
603 }
604 [[fallthrough]];
605 case ISD::FADD:
606 case ISD::FSUB:
607 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
608 NElts = (NElts + 1) / 2;
609 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
610 NElts = (NElts + 1) / 2;
611 if (SLT == MVT::f64)
612 return LT.first * NElts * get64BitInstrCost(CostKind);
613
614 if (ST->has16BitInsts() && SLT == MVT::f16)
615 NElts = (NElts + 1) / 2;
616
617 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
618 return LT.first * NElts * getFullRateInstrCost();
619 break;
620 case ISD::FDIV:
621 case ISD::FREM:
622 // FIXME: frem should be handled separately. The fdiv in it is most of it,
623 // but the current lowering is also not entirely correct.
624 if (SLT == MVT::f64) {
625 int Cost = 7 * get64BitInstrCost(CostKind) +
626 getQuarterRateInstrCost(CostKind) +
627 3 * getHalfRateInstrCost(CostKind);
628 // Add cost of workaround.
629 if (!ST->hasUsableDivScaleConditionOutput())
630 Cost += 3 * getFullRateInstrCost();
631
632 return LT.first * Cost * NElts;
633 }
634
635 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
636 // TODO: This is more complicated, unsafe flags etc.
637 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
638 (SLT == MVT::f16 && ST->has16BitInsts())) {
639 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
640 }
641 }
642
643 if (SLT == MVT::f16 && ST->has16BitInsts()) {
644 // 2 x v_cvt_f32_f16
645 // f32 rcp
646 // f32 fmul
647 // v_cvt_f16_f32
648 // f16 div_fixup
649 int Cost =
650 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
651 return LT.first * Cost * NElts;
652 }
653
654 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
655 // Fast unsafe fdiv lowering:
656 // f32 rcp
657 // f32 fmul
658 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
659 return LT.first * Cost * NElts;
660 }
661
662 if (SLT == MVT::f32 || SLT == MVT::f16) {
663 // 4 more v_cvt_* insts without f16 insts support
664 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
665 1 * getQuarterRateInstrCost(CostKind);
666
667 if (!HasFP32Denormals) {
668 // FP mode switches.
669 Cost += 2 * getFullRateInstrCost();
670 }
671
672 return LT.first * NElts * Cost;
673 }
674 break;
675 case ISD::FNEG:
676 // Use the backend' estimation. If fneg is not free each element will cost
677 // one additional instruction.
678 return TLI->isFNegFree(SLT) ? 0 : NElts;
679 default:
680 break;
681 }
682
683 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
684 Args, CxtI);
685}
686
687// Return true if there's a potential benefit from using v2f16/v2i16
688// instructions for an intrinsic, even if it requires nontrivial legalization.
690 switch (ID) {
691 case Intrinsic::fma:
692 case Intrinsic::fmuladd:
693 case Intrinsic::copysign:
694 case Intrinsic::minimumnum:
695 case Intrinsic::maximumnum:
696 case Intrinsic::canonicalize:
697 // There's a small benefit to using vector ops in the legalized code.
698 case Intrinsic::round:
699 case Intrinsic::uadd_sat:
700 case Intrinsic::usub_sat:
701 case Intrinsic::sadd_sat:
702 case Intrinsic::ssub_sat:
703 case Intrinsic::abs:
704 return true;
705 default:
706 return false;
707 }
708}
709
713 switch (ICA.getID()) {
714 case Intrinsic::fabs:
715 // Free source modifier in the common case.
716 return 0;
717 case Intrinsic::amdgcn_workitem_id_x:
718 case Intrinsic::amdgcn_workitem_id_y:
719 case Intrinsic::amdgcn_workitem_id_z:
720 // TODO: If hasPackedTID, or if the calling context is not an entry point
721 // there may be a bit instruction.
722 return 0;
723 case Intrinsic::amdgcn_workgroup_id_x:
724 case Intrinsic::amdgcn_workgroup_id_y:
725 case Intrinsic::amdgcn_workgroup_id_z:
726 case Intrinsic::amdgcn_lds_kernel_id:
727 case Intrinsic::amdgcn_dispatch_ptr:
728 case Intrinsic::amdgcn_dispatch_id:
729 case Intrinsic::amdgcn_implicitarg_ptr:
730 case Intrinsic::amdgcn_queue_ptr:
731 // Read from an argument register.
732 return 0;
733 default:
734 break;
735 }
736
737 Type *RetTy = ICA.getReturnType();
738
739 Intrinsic::ID IID = ICA.getID();
740 switch (IID) {
741 case Intrinsic::exp:
742 case Intrinsic::exp2:
743 case Intrinsic::exp10: {
744 // Legalize the type.
745 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
746 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
747
748 if (SLT == MVT::f64) {
749 int NumOps = 20;
750 if (IID == Intrinsic::exp)
751 ++NumOps;
752 else if (IID == Intrinsic::exp10)
753 NumOps += 3;
754
755 unsigned NElts =
756 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
757
758 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
759 }
760
761 break;
762 }
763 default:
764 break;
765 }
766
769
770 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
771 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
772 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
773
774 if ((ST->hasVOP3PInsts() &&
775 (SLT == MVT::f16 || SLT == MVT::i16 ||
776 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
777 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
778 NElts = (NElts + 1) / 2;
779
780 // TODO: Get more refined intrinsic costs?
781 unsigned InstRate = getQuarterRateInstrCost(CostKind);
782
783 switch (ICA.getID()) {
784 case Intrinsic::fma:
785 case Intrinsic::fmuladd:
786 if (SLT == MVT::f64) {
787 InstRate = get64BitInstrCost(CostKind);
788 break;
789 }
790
791 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
792 InstRate = getFullRateInstrCost();
793 else {
794 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
795 : getQuarterRateInstrCost(CostKind);
796 }
797 break;
798 case Intrinsic::copysign:
799 return NElts * getFullRateInstrCost();
800 case Intrinsic::minimumnum:
801 case Intrinsic::maximumnum: {
802 // Instruction + 2 canonicalizes. For cases that need type promotion, we the
803 // promotion takes the place of the canonicalize.
804 unsigned NumOps = 3;
805 if (const IntrinsicInst *II = ICA.getInst()) {
806 // Directly legal with ieee=0
807 // TODO: Not directly legal with strictfp
809 NumOps = 1;
810 }
811
812 unsigned BaseRate =
813 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
814 InstRate = BaseRate * NumOps;
815 break;
816 }
817 case Intrinsic::canonicalize: {
818 InstRate =
819 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
820 break;
821 }
822 case Intrinsic::uadd_sat:
823 case Intrinsic::usub_sat:
824 case Intrinsic::sadd_sat:
825 case Intrinsic::ssub_sat: {
826 if (SLT == MVT::i16 || SLT == MVT::i32)
827 InstRate = getFullRateInstrCost();
828
829 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
830 if (any_of(ValidSatTys, equal_to(LT.second)))
831 NElts = 1;
832 break;
833 }
834 case Intrinsic::abs:
835 // Expansion takes 2 instructions for VALU
836 if (SLT == MVT::i16 || SLT == MVT::i32)
837 InstRate = 2 * getFullRateInstrCost();
838 break;
839 default:
840 break;
841 }
842
843 return LT.first * NElts * InstRate;
844}
845
848 const Instruction *I) const {
849 assert((I == nullptr || I->getOpcode() == Opcode) &&
850 "Opcode should reflect passed instruction.");
851 const bool SCost =
853 const int CBrCost = SCost ? 5 : 7;
854 switch (Opcode) {
855 case Instruction::Br: {
856 // Branch instruction takes about 4 slots on gfx900.
857 const auto *BI = dyn_cast_or_null<BranchInst>(I);
858 if (BI && BI->isUnconditional())
859 return SCost ? 1 : 4;
860 // Suppose conditional branch takes additional 3 exec manipulations
861 // instructions in average.
862 return CBrCost;
863 }
864 case Instruction::Switch: {
865 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
866 // Each case (including default) takes 1 cmp + 1 cbr instructions in
867 // average.
868 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
869 }
870 case Instruction::Ret:
871 return SCost ? 1 : 10;
872 }
873 return BaseT::getCFInstrCost(Opcode, CostKind, I);
874}
875
878 std::optional<FastMathFlags> FMF,
881 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
882
883 EVT OrigTy = TLI->getValueType(DL, Ty);
884
885 // Computes cost on targets that have packed math instructions(which support
886 // 16-bit types only).
887 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
888 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
889
890 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
891 return LT.first * getFullRateInstrCost();
892}
893
896 FastMathFlags FMF,
898 EVT OrigTy = TLI->getValueType(DL, Ty);
899
900 // Computes cost on targets that have packed math instructions(which support
901 // 16-bit types only).
902 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
903 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
904
905 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
906 return LT.first * getHalfRateInstrCost(CostKind);
907}
908
910 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
911 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
912 switch (Opcode) {
913 case Instruction::ExtractElement:
914 case Instruction::InsertElement: {
915 unsigned EltSize
916 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
917 if (EltSize < 32) {
918 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
919 return 0;
920 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
921 VIC);
922 }
923
924 // Extracts are just reads of a subregister, so are free. Inserts are
925 // considered free because we don't want to have any cost for scalarizing
926 // operations, and we don't have to copy into a different register class.
927
928 // Dynamic indexing isn't free and is best avoided.
929 return Index == ~0u ? 2 : 0;
930 }
931 default:
932 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
933 VIC);
934 }
935}
936
937/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
938/// this is analyzing the collective result of all output registers. Otherwise,
939/// this is only querying a specific result index if this returns multiple
940/// registers in a struct.
942 const CallInst *CI, ArrayRef<unsigned> Indices) const {
943 // TODO: Handle complex extract indices
944 if (Indices.size() > 1)
945 return true;
946
947 const DataLayout &DL = CI->getDataLayout();
948 const SIRegisterInfo *TRI = ST->getRegisterInfo();
949 TargetLowering::AsmOperandInfoVector TargetConstraints =
950 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
951
952 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
953
954 int OutputIdx = 0;
955 for (auto &TC : TargetConstraints) {
956 if (TC.Type != InlineAsm::isOutput)
957 continue;
958
959 // Skip outputs we don't care about.
960 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
961 continue;
962
963 TLI->ComputeConstraintToUse(TC, SDValue());
964
965 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
966 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
967
968 // For AGPR constraints null is returned on subtargets without AGPRs, so
969 // assume divergent for null.
970 if (!RC || !TRI->isSGPRClass(RC))
971 return true;
972 }
973
974 return false;
975}
976
978 const IntrinsicInst *ReadReg) const {
979 Metadata *MD =
980 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
982 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
983
984 // Special case registers that look like VCC.
985 MVT VT = MVT::getVT(ReadReg->getType());
986 if (VT == MVT::i1)
987 return true;
988
989 // Special case scalar registers that start with 'v'.
990 if (RegName.starts_with("vcc") || RegName.empty())
991 return false;
992
993 // VGPR or AGPR is divergent. There aren't any specially named vector
994 // registers.
995 return RegName[0] == 'v' || RegName[0] == 'a';
996}
997
998/// \returns true if the result of the value could potentially be
999/// different across workitems in a wavefront.
// NOTE(review): statement lines 1002, 1018, 1021, 1025, 1044 and 1051 of the
// original file were lost in extraction (e.g. the 'return' for the Argument
// case and the isa<>/dyn_cast<IntrinsicInst> condition heads). The remaining
// text is kept byte-for-byte; restore the missing lines from upstream before
// compiling.
1000bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1001 if (const Argument *A = dyn_cast<Argument>(V))
1003
1004 // Loads from the private and flat address spaces are divergent, because
1005 // threads can execute the load instruction with the same inputs and get
1006 // different results.
1007 //
1008 // All other loads are not divergent, because if threads issue loads with the
1009 // same arguments, they will always get the same result.
1010 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1011 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1012 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1013
1014 // Atomics are divergent because they are executed sequentially: when an
1015 // atomic operation refers to the same address in each thread, then each
1016 // thread after the first sees the value written by the previous thread as
1017 // original value.
1019 return true;
1020
1022 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1023 switch (IID) {
1024 case Intrinsic::read_register:
// With globally addressable scratch, a private-to-flat cast's result is not
// uniform (see the AddrSpaceCastInst handling further below).
1026 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1027 unsigned SrcAS =
1028 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1029 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1030 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1031 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1032 ST->hasGloballyAddressableScratch();
1033 }
// workitem.id.y/z is uniform when waves never span that dimension: either
// the subtarget reports evenly-splitting X with uniform Y/Z, or the required
// work-group size in this dimension is 1.
1034 case Intrinsic::amdgcn_workitem_id_y:
1035 case Intrinsic::amdgcn_workitem_id_z: {
1036 const Function *F = Intrinsic->getFunction();
1037 bool HasUniformYZ =
1038 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
1039 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1040 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1041 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1042 }
1043 default:
1045 }
1046 }
1047
1048 // Assume all function calls are a source of divergence.
1049 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1050 if (CI->isInlineAsm())
1052 return true;
1053 }
1054
1055 // Likewise, assume all invokes are a source of divergence.
1056 if (isa<InvokeInst>(V))
1057 return true;
1058
1059 // If the target supports globally addressable scratch, the mapping from
1060 // scratch memory to the flat aperture changes therefore an address space cast
1061 // is no longer uniform.
1062 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1063 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1064 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1065 ST->hasGloballyAddressableScratch();
1066 }
1067
1068 return false;
1069}
1070
// Returns true when \p V is known to produce the same value in every lane of
// a wavefront regardless of its operands' divergence.
// NOTE(review): source lines 1077, 1101, 1103 and 1109 were lost in
// extraction (an inline-asm branch and the heads of two pattern-match
// conditions - presumably shifts/and of workitem.id.x; verify against
// upstream). The remaining text is kept byte-for-byte.
1071bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1072 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1073 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1074
1075 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1076 if (CI->isInlineAsm())
1078 return false;
1079 }
1080
1081 // In most cases TID / wavefrontsize is uniform.
1082 //
1083 // However, if a kernel has uneven dimensions we can have a value of
1084 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1085 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1086 // packed into a same wave which gives 1 and 0 after the division by 64
1087 // respectively.
1088 //
1089 // The X dimension doesn't reset within a wave if either both the Y
1090 // and Z dimensions are of length 1, or if the X dimension's required
1091 // size is a power of 2. Note, however, if the X dimension's maximum
1092 // size is a power of 2 < the wavefront size, division by the wavefront
1093 // size is guaranteed to yield 0, so this is also a no-reset case.
1094 bool XDimDoesntResetWithinWaves = false;
1095 if (auto *I = dyn_cast<Instruction>(V)) {
1096 const Function *F = I->getFunction();
1097 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1098 }
1099 using namespace llvm::PatternMatch;
1100 uint64_t C;
1102 m_ConstantInt(C))) ||
1104 m_ConstantInt(C)))) {
1105 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1106 }
1107
1108 Value *Mask;
1110 m_Value(Mask)))) {
// Masking off at least log2(wavefront size) low bits keeps the value equal
// across the wave, provided the X dimension doesn't reset mid-wave.
1111 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1112 ST->getWavefrontSizeLog2() &&
1113 XDimDoesntResetWithinWaves;
1114 }
1115
1116 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1117 if (!ExtValue)
1118 return false;
1119
1120 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1121 if (!CI)
1122 return false;
1123
1124 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1125 switch (Intrinsic->getIntrinsicID()) {
1126 default:
1127 return false;
// Extracting element 1 of the struct returned by amdgcn.if/else is uniform.
1128 case Intrinsic::amdgcn_if:
1129 case Intrinsic::amdgcn_else: {
1130 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1131 return Indices.size() == 1 && Indices[0] == 1;
1132 }
1133 }
1134 }
1135
1136 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1137 // divergent for the overall struct return. We need to override it in the
1138 // case we're extracting an SGPR component here.
1139 if (CI->isInlineAsm())
1140 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1141
1142 return false;
1143}
1144
// Records which pointer operands of the given AMDGPU intrinsic are eligible
// for address-space rewriting (operand 0 for every handled intrinsic) and
// returns true when OpIndexes was populated.
// NOTE(review): the first signature line (original line 1145, presumably
// GCNTTIImpl::collectFlatAddressOperands) was lost in extraction.
1146 Intrinsic::ID IID) const {
1147 switch (IID) {
1148 case Intrinsic::amdgcn_is_shared:
1149 case Intrinsic::amdgcn_is_private:
1150 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1151 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1152 case Intrinsic::amdgcn_load_to_lds:
1153 case Intrinsic::amdgcn_make_buffer_rsrc:
1154 OpIndexes.push_back(0);
1155 return true;
1156 default:
1157 return false;
1158 }
1159}
1160
// Rewrites intrinsic call \p II so it operates on \p NewV (a pointer with an
// inferred, more specific address space) instead of \p OldV. Returns the
// replacement value (possibly a constant, or the mutated call), or nullptr
// when no rewrite applies.
// NOTE(review): source lines 1161 (signature head), 1169, 1173, 1181, 1184
// and 1203 were lost in extraction (e.g. the TrueAS/NewVal ternary arms and
// two Intrinsic::getOrInsertDeclaration call heads); the remaining text is
// kept byte-for-byte. Restore from upstream before compiling.
1162 Value *OldV,
1163 Value *NewV) const {
1164 auto IntrID = II->getIntrinsicID();
1165 switch (IntrID) {
// is.shared/is.private: once the address space is known the predicate folds
// to a constant (true iff the new AS matches the one being tested).
1166 case Intrinsic::amdgcn_is_shared:
1167 case Intrinsic::amdgcn_is_private: {
1168 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1170 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1171 LLVMContext &Ctx = NewV->getType()->getContext();
1172 ConstantInt *NewVal = (TrueAS == NewAS) ?
1174 return NewVal;
1175 }
// Flat atomics: re-mangle the declaration for the new pointer type and swap
// the pointer operand in place.
1176 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1177 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1178 Type *DestTy = II->getType();
1179 Type *SrcTy = NewV->getType();
1180 unsigned NewAS = SrcTy->getPointerAddressSpace();
1182 return nullptr;
1183 Module *M = II->getModule();
1185 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1186 II->setArgOperand(0, NewV);
1187 II->setCalledFunction(NewDecl);
1188 return II;
1189 }
1190 case Intrinsic::amdgcn_load_to_lds: {
1191 Type *SrcTy = NewV->getType();
1192 Module *M = II->getModule();
1193 Function *NewDecl =
1194 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1195 II->setArgOperand(0, NewV);
1196 II->setCalledFunction(NewDecl);
1197 return II;
1198 }
1199 case Intrinsic::amdgcn_make_buffer_rsrc: {
1200 Type *SrcTy = NewV->getType();
1201 Type *DstTy = II->getType();
1202 Module *M = II->getModule();
1204 M, II->getIntrinsicID(), {DstTy, SrcTy});
1205 II->setArgOperand(0, NewV);
1206 II->setCalledFunction(NewDecl);
1207 return II;
1208 }
1209 default:
1210 return nullptr;
1211 }
1212}
1213
// Cost model for vector shuffles. For sub-dword (8/16-bit) element types on
// VI+ the cost is estimated in units of 32-bit-register moves/v_perm_b32
// instructions; everything else falls through to the base implementation.
// NOTE(review): source lines 1214 (signature head "GCNTTIImpl::getShuffleCost"),
// 1217, 1219 (CostKind/Args parameters), 1258-1259 and 1264-1265 (case labels,
// presumably SK_ExtractSubvector / SK_InsertSubvector), 1268, 1292 (returns
// for the non-fixed-vector DstTy guards), and 1323 (the 'Regs' small-vector
// declaration) were lost in extraction. The remaining text is kept
// byte-for-byte; restore from upstream before compiling.
1215 VectorType *DstTy, VectorType *SrcTy,
1216 ArrayRef<int> Mask,
1218 int Index, VectorType *SubTp,
1220 const Instruction *CxtI) const {
1221 if (!isa<FixedVectorType>(SrcTy))
1222 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1223 SubTp);
1224
1225 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1226
1227 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1228 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1229 (ScalarSize == 16 || ScalarSize == 8)) {
1230 // Larger vector widths may require additional instructions, but are
1231 // typically cheaper than scalarized versions.
1232 //
1233 // We assume that shuffling at a register granularity can be done for free.
1234 // This is not true for vectors fed into memory instructions, but it is
1235 // effectively true for all other shuffling. The emphasis of the logic here
1236 // is to assist generic transform in cleaning up / canonicalizing those
1237 // shuffles.
1238
1239 // With op_sel VOP3P instructions freely can access the low half or high
1240 // half of a register, so any swizzle of two elements is free.
1241 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1242 unsigned NumSrcElts = SrcVecTy->getNumElements();
1243 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1244 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1245 Kind == TTI::SK_PermuteSingleSrc))
1246 return 0;
1247 }
1248
1249 unsigned EltsPerReg = 32 / ScalarSize;
1250 switch (Kind) {
1251 case TTI::SK_Broadcast:
1252 // A single v_perm_b32 can be re-used for all destination registers.
1253 return 1;
1254 case TTI::SK_Reverse:
1255 // One instruction per register.
1256 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1257 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1260 if (Index % EltsPerReg == 0)
1261 return 0; // Shuffling at register granularity
1262 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1263 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1266 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1267 if (!DstVecTy)
1269 unsigned NumDstElts = DstVecTy->getNumElements();
1270 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1271 unsigned EndIndex = Index + NumInsertElts;
1272 unsigned BeginSubIdx = Index % EltsPerReg;
1273 unsigned EndSubIdx = EndIndex % EltsPerReg;
1274 unsigned Cost = 0;
1275
1276 if (BeginSubIdx != 0) {
1277 // Need to shift the inserted vector into place. The cost is the number
1278 // of destination registers overlapped by the inserted vector.
1279 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1280 }
1281
1282 // If the last register overlap is partial, there may be three source
1283 // registers feeding into it; that takes an extra instruction.
1284 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1285 Cost += 1;
1286
1287 return Cost;
1288 }
1289 case TTI::SK_Splice: {
1290 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1291 if (!DstVecTy)
1293 unsigned NumElts = DstVecTy->getNumElements();
1294 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1295 // Determine the sub-region of the result vector that requires
1296 // sub-register shuffles / mixing.
1297 unsigned EltsFromLHS = NumElts - Index;
1298 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1299 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1300 if (LHSIsAligned && RHSIsAligned)
1301 return 0;
1302 if (LHSIsAligned && !RHSIsAligned)
1303 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1304 if (!LHSIsAligned && RHSIsAligned)
1305 return divideCeil(EltsFromLHS, EltsPerReg);
1306 return divideCeil(NumElts, EltsPerReg);
1307 }
1308 default:
1309 break;
1310 }
1311
1312 if (!Mask.empty()) {
1313 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1314
1315 // Generically estimate the cost by assuming that each destination
1316 // register is derived from sources via v_perm_b32 instructions if it
1317 // can't be copied as-is.
1318 //
1319 // For each destination register, derive the cost of obtaining it based
1320 // on the number of source registers that feed into it.
1321 unsigned Cost = 0;
1322 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1324 bool Aligned = true;
1325 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1326 int SrcIdx = Mask[DstIdx + I];
1327 if (SrcIdx == -1)
1328 continue;
// Map the source element to a flat register index: LHS registers first,
// then RHS registers; mark misaligned when the lane position differs.
1329 int Reg;
1330 if (SrcIdx < (int)NumSrcElts) {
1331 Reg = SrcIdx / EltsPerReg;
1332 if (SrcIdx % EltsPerReg != I)
1333 Aligned = false;
1334 } else {
1335 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1336 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1337 Aligned = false;
1338 }
1339 if (!llvm::is_contained(Regs, Reg))
1340 Regs.push_back(Reg);
1341 }
1342 if (Regs.size() >= 2)
1343 Cost += Regs.size() - 1;
1344 else if (!Aligned)
1345 Cost += 1;
1346 }
1347 return Cost;
1348 }
1349 }
1350
1351 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1352 SubTp);
1353}
1354
1355/// Whether it is profitable to sink the operands of an
1356/// Instruction I to the basic block of I.
1357/// This helps using several modifiers (like abs and neg) more often.
// NOTE(review): the signature head (original line 1358, presumably
// "bool GCNTTIImpl::isProfitableToSinkOperands(Instruction *I,") and line
// 1381 (the CostKind argument of the getVectorInstrCost call) were lost in
// extraction; the remaining text is kept byte-for-byte.
1359 SmallVectorImpl<Use *> &Ops) const {
1360 using namespace PatternMatch;
1361
1362 for (auto &Op : I->operands()) {
1363 // Ensure we are not already sinking this operand.
1364 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1365 continue;
1366
// fabs/fneg can be folded into source modifiers, so sinking them next to
// their user is always worthwhile.
1367 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1368 Ops.push_back(&Op);
1369 continue;
1370 }
1371
1372 // Check for zero-cost multiple use InsertElement/ExtractElement
1373 // instructions
1374 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1375 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1376 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1377 if (VecOpInst && VecOpInst->hasOneUse())
1378 continue;
1379
1380 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1382 OpInst->getOperand(0),
1383 OpInst->getOperand(1)) == 0) {
1384 Ops.push_back(&Op);
1385 continue;
1386 }
1387 }
1388 }
1389
1390 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1391
1392 unsigned EltSize = DL.getTypeSizeInBits(
1393 cast<VectorType>(Shuffle->getType())->getElementType());
1394
1395 // For i32 (or greater) shufflevectors, these will be lowered into a
1396 // series of insert / extract elements, which will be coalesced away.
1397 if (EltSize < 16 || !ST->has16BitInsts())
1398 continue;
1399
1400 int NumSubElts, SubIndex;
1401 if (Shuffle->changesLength()) {
1402 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1403 Ops.push_back(&Op);
1404 continue;
1405 }
1406
// Only even sub-vector offsets stay register-aligned for 16-bit elements.
1407 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1408 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1409 !(SubIndex & 0x1)) {
1410 Ops.push_back(&Op);
1411 continue;
1412 }
1413 }
1414
1415 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1416 Shuffle->isSingleSource()) {
1417 Ops.push_back(&Op);
1418 continue;
1419 }
1420 }
1421 }
1422
1423 return !Ops.empty();
1424}
1425
1427 const Function *Callee) const {
1428 const TargetMachine &TM = getTLI()->getTargetMachine();
1429 const GCNSubtarget *CallerST
1430 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1431 const GCNSubtarget *CalleeST
1432 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1433
1434 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1435 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1436
1437 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1438 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1439 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1440 return false;
1441
1442 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1443 // no way to support merge for backend defined attributes.
1444 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1445 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1446 if (!CallerMode.isInlineCompatible(CalleeMode))
1447 return false;
1448
1449 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1450 Callee->hasFnAttribute(Attribute::InlineHint))
1451 return true;
1452
1453 // Hack to make compile times reasonable.
1454 if (InlineMaxBB) {
1455 // Single BB does not increase total BB amount.
1456 if (Callee->size() == 1)
1457 return true;
1458 size_t BBSize = Caller->size() + Callee->size() - 1;
1459 return BBSize <= InlineMaxBB;
1460 }
1461
1462 return true;
1463}
1464
// Computes an inline-threshold bonus proportional to the number of argument
// registers that would spill to the stack if the call were NOT inlined:
// each SGPR beyond 26 / VGPR beyond 32 is charged the modelled stack
// store+load cost.
// NOTE(review): source lines 1465 (the "static unsigned
// adjustInliningThresholdUsingCallee(const CallBase *CB," signature head -
// confirmed by the file's declaration index), 1482 (the SGPR-vs-VGPR
// condition), 1497 and 1500 (trailing arguments of the two getMemoryOpCost
// calls) were lost in extraction; the remaining text is kept byte-for-byte.
1466 const SITargetLowering *TLI,
1467 const GCNTTIImpl *TTIImpl) {
1468 const int NrOfSGPRUntilSpill = 26;
1469 const int NrOfVGPRUntilSpill = 32;
1470
1471 const DataLayout &DL = TTIImpl->getDataLayout();
1472
1473 unsigned adjustThreshold = 0;
1474 int SGPRsInUse = 0;
1475 int VGPRsInUse = 0;
1476 for (const Use &A : CB->args()) {
// Count calling-convention registers consumed by each legalized value type
// of every call argument.
1477 SmallVector<EVT, 4> ValueVTs;
1478 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1479 for (auto ArgVT : ValueVTs) {
1480 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1481 CB->getContext(), CB->getCallingConv(), ArgVT);
1483 SGPRsInUse += CCRegNum;
1484 else
1485 VGPRsInUse += CCRegNum;
1486 }
1487 }
1488
1489 // The cost of passing function arguments through the stack:
1490 // 1 instruction to put a function argument on the stack in the caller.
1491 // 1 instruction to take a function argument from the stack in callee.
1492 // 1 instruction to explicitly take care of data dependencies in the callee
1493 // function.
1494 InstructionCost ArgStackCost(1);
1495 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1496 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1498 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1499 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1501
1502 // The penalty cost is computed relative to the cost of instructions and does
1503 // not model any storage costs.
1504 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1505 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1506 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1507 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1508 return adjustThreshold;
1509}
1510
1511static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1512 const DataLayout &DL) {
1513 // If we have a pointer to a private array passed into a function
1514 // it will not be optimized out, leaving scratch usage.
1515 // This function calculates the total size in bytes of the memory that would
1516 // end in scratch if the call was not inlined.
1517 unsigned AllocaSize = 0;
1519 for (Value *PtrArg : CB->args()) {
1520 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1521 if (!Ty)
1522 continue;
1523
1524 unsigned AddrSpace = Ty->getAddressSpace();
1525 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1526 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1527 continue;
1528
1530 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1531 continue;
1532
1533 if (auto Size = AI->getAllocationSize(DL))
1534 AllocaSize += Size->getFixedValue();
1535 }
1536 return AllocaSize;
1537}
1538
// NOTE(review): the signature of this member (original lines ~1539-1544,
// presumably "unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase
// *CB) const {") was lost in extraction; the body below is kept
// byte-for-byte.
1543
1545 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1546
1547 // Private objects passed as arguments may end up in scratch usage if the call
1548 // is not inlined. Increase the inline threshold to promote inlining.
1549 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1550 if (AllocaSize > 0)
1551 Threshold += ArgAllocaCost;
1552 return Threshold;
1553}
1554
// Assigns each private-memory alloca argument a share of the inliner cost
// that, summed across allocas, cancels the ArgAllocaCost bonus granted in
// adjustInliningThreshold (so SROA-able allocas net a bonus, others don't).
// NOTE(review): the signature head (original line 1555, presumably
// "unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,") was lost
// in extraction; the body below is kept byte-for-byte.
1556 const AllocaInst *AI) const {
1557
1558 // Below the cutoff, assume that the private memory objects would be
1559 // optimized away.
1560 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL)
1561 if (AllocaSize <= ArgAllocaCutoff)
1562 return 0;
1563
1564 // Above the cutoff, we give a cost to each private memory object
1565 // depending on its size. If the array can be optimized by SROA this cost is not
1566 // added to the total-cost in the inliner cost analysis.
1567 //
1568 // We choose the total cost of the alloca such that their sum cancels the
1569 // bonus given in the threshold (ArgAllocaCost).
1570 //
1571 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1572 //
1573 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1574 // the single-bb bonus and the vector-bonus.
1575 //
1576 // We compensate the first two multipliers, by repeating logic from the
1577 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1578 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1579 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1580
// Mirror the inliner's single-basic-block bonus (+50%).
1581 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1582 return BB.getTerminator()->getNumSuccessors() > 1;
1583 });
1584 if (SingleBB) {
1585 Threshold += Threshold / 2;
1586 }
1587
1588 auto ArgAllocaSize = AI->getAllocationSize(DL);
1589 if (!ArgAllocaSize)
1590 return 0;
1591
1592 // Attribute the bonus proportionally to the alloca size
1593 unsigned AllocaThresholdBonus =
1594 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1595
1596 return AllocaThresholdBonus;
1597}
1598
// NOTE(review): the signature head of this override (original lines
// 1599-1600) was lost in extraction. It forwards to the shared AMDGPU
// implementation (CommonTTI).
1601 OptimizationRemarkEmitter *ORE) const {
1602 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1603}
1604
// NOTE(review): the signature head of this override (original line 1605) was
// lost in extraction. It forwards to the shared AMDGPU implementation
// (CommonTTI).
1606 TTI::PeelingPreferences &PP) const {
1607 CommonTTI.getPeelingPreferences(L, SE, PP);
1608}
1609
1610int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1611 return ST->hasFullRate64Ops()
1612 ? getFullRateInstrCost()
1613 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1614 : getQuarterRateInstrCost(CostKind);
1615}
1616
1617std::pair<InstructionCost, MVT>
1618GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1619 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1620 auto Size = DL.getTypeSizeInBits(Ty);
1621 // Maximum load or store can handle 8 dwords for scalar and 4 for
1622 // vector ALU. Let's assume anything above 8 dwords is expensive
1623 // even if legal.
1624 if (Size <= 256)
1625 return Cost;
1626
1627 Cost.first += (Size + 255) / 256;
1628 return Cost;
1629}
1630
// NOTE(review): the signature line of this accessor (original line 1631,
// presumably GCNTTIImpl::getPrefetchDistance) was lost in extraction.
// 128 bytes when the subtarget supports prefetch instructions; 0 disables
// software prefetching.
1632 return ST->hasPrefetch() ? 128 : 0;
1633}
1634
1637}
1638
1640 const Function &F,
1641 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1642 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1643 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1644 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1645 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1646 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1647 ST->getFlatWorkGroupSizes(F);
1648 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1649 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1650 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1651 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1652 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1653}
1654
// Classifies the known state of the IEEE field of the MODE register for
// instruction \p I: always On when the subtarget lacks a controllable IEEE
// mode, otherwise driven by the "amdgpu-ieee" attribute or the calling
// convention (shaders default to Off).
// NOTE(review): source lines 1655-1656 (signature, presumably
// "GCNTTIImpl::KnownIEEEMode GCNTTIImpl::fpenvIEEEMode(const Instruction &I)
// const"), 1662, 1666 and 1669 (return statements / the second ternary arm)
// were lost in extraction; the body below is kept byte-for-byte.
1657 if (!ST->hasIEEEMode()) // Only mode on gfx12
1658 return KnownIEEEMode::On;
1659
1660 const Function *F = I.getFunction();
1661 if (!F)
1663
// A valid "amdgpu-ieee" function attribute overrides the calling-convention
// default (dropped line 1666 presumably returns based on getValueAsBool()).
1664 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1665 if (IEEEAttr.isValid())
1667
1668 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1670}
1671
// Memory-op cost override: loads/stores of i8 vectors are priced per packed
// 32-bit chunk (the declaration index documents this as "Account for loads
// of i8 vector types to have reduced cost"); everything else defers to the
// base implementation.
// NOTE(review): source lines 1672 (signature head
// "InstructionCost GCNTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,"),
// 1675 (the CostKind parameter) and 1682 (the second argument of divideCeil)
// were lost in extraction; the remaining text is kept byte-for-byte.
1673 Align Alignment,
1674 unsigned AddressSpace,
1676 TTI::OperandValueInfo OpInfo,
1677 const Instruction *I) const {
1678 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1679 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1680 VecTy->getElementType()->isIntegerTy(8)) {
1681 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1683 }
1684 }
1685 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1686 OpInfo, I);
}
1688
1690 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1691 if (VecTy->getElementType()->isIntegerTy(8)) {
1692 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1693 return divideCeil(ElementCount - 1, 4);
1694 }
1695 }
1696 return BaseT::getNumberOfParts(Tp);
1697}
1698
// NOTE(review): the signature of this function (original lines ~1699-1700)
// and its return statements (lines 1702, 1705, 1707) were lost in
// extraction. The visible skeleton consults isAlwaysUniform(V) first, then
// isSourceOfDivergence(V) - presumably mapping a value to a three-way
// uniformity classification. Restore from upstream before compiling.
1701 if (isAlwaysUniform(V))
1703
1704 if (isSourceOfDivergence(V))
1706
1708}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool RequiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Conditional or Unconditional Branch instruction.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:22
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionUniformity getInstructionUniformity(const Value *V) const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1080
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...