LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include <optional>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
39 "amdgpu-unroll-threshold-private",
40 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
41 cl::init(2700), cl::Hidden);
42
44 "amdgpu-unroll-threshold-local",
45 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
46 cl::init(1000), cl::Hidden);
47
49 "amdgpu-unroll-threshold-if",
50 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
51 cl::init(200), cl::Hidden);
52
54 "amdgpu-unroll-runtime-local",
55 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
56 cl::init(true), cl::Hidden);
57
59 "amdgpu-unroll-max-block-to-analyze",
60 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
61 cl::init(32), cl::Hidden);
62
63static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
64 cl::Hidden, cl::init(4000),
65 cl::desc("Cost of alloca argument"));
66
67// If the amount of scratch memory to eliminate exceeds our ability to allocate
68// it into registers we gain nothing by aggressively inlining functions for that
69// heuristic.
71 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
72 cl::init(256),
73 cl::desc("Maximum alloca size to use for inline cost"));
74
75// Inliner constraint to achieve reasonable compilation time.
77 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
78 cl::desc("Maximum number of BBs allowed in a function after inlining"
79 " (compile time constraint)"));
80
81// This default unroll factor is based on microbenchmarks on gfx1030.
83 "amdgpu-memcpy-loop-unroll",
// Adjacent string literals are concatenated verbatim, so every piece except
// the last must end with its own separating space.
84 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
85 "operations when lowering statically-sized memcpy, memmove, or "
86 "memset as a loop"),
87 cl::init(16), cl::Hidden);
88
// Returns true when the instruction behind Cond has, directly or through a
// bounded chain of operands, a PHI operand that is not contained in any
// sub-loop of L.
//
// L     - the loop being examined.
// Cond  - the value to inspect (callers pass a branch condition).
// Depth - current recursion depth; callers normally leave it at 0.
//
// NOTE(review): the line that defines I is not visible in this listing;
// presumably it is Cond viewed as an Instruction - confirm against the
// upstream file.
89static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
90 unsigned Depth = 0) {
// Nothing to inspect if there is no instruction.
92 if (!I)
93 return false;
94
// Instructions outside of L are not followed.
95 if (!L->contains(I))
96 return false;
97 for (const Value *V : I->operand_values()) {
98 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
// A PHI operand counts only if no sub-loop of L contains it.
99 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
100 return SubLoop->contains(PHI); }))
101 return true;
// Non-PHI operands are searched recursively; the recursion stops once Depth
// reaches 10.
102 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
103 return true;
104 }
105 return false;
106}
107
109 : BaseT(TM, F.getDataLayout()),
110 TargetTriple(TM->getTargetTriple()),
111 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
112 TLI(ST->getTargetLowering()) {}
113
116 OptimizationRemarkEmitter *ORE) const {
117 const Function &F = *L->getHeader()->getParent();
118 UP.Threshold =
119 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
120 UP.MaxCount = std::numeric_limits<unsigned>::max();
121 UP.Partial = true;
122
123 // A conditional branch on a loop back edge needs 3 additional exec
124 // manipulations on average.
125 UP.BEInsns += 3;
126
127 // We want to run unroll even for the loops which have been vectorized.
128 UP.UnrollVectorizedLoop = true;
129
130 // Enable runtime unrolling for loops whose trip count is not known at
131 // compile time.
132 UP.Runtime = true;
133
134 // Maximum alloca size that can fit in registers. Reserve 16 registers.
135 const unsigned MaxAlloca = (256 - 16) * 4;
136 unsigned ThresholdPrivate = UnrollThresholdPrivate;
137 unsigned ThresholdLocal = UnrollThresholdLocal;
138
139 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
140 // provided threshold value as the default for Threshold
141 if (MDNode *LoopUnrollThreshold =
142 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
143 if (LoopUnrollThreshold->getNumOperands() == 2) {
145 LoopUnrollThreshold->getOperand(1));
146 if (MetaThresholdValue) {
147 // We will also use the supplied value for PartialThreshold for now.
148 // We may introduce additional metadata if it becomes necessary in the
149 // future.
150 UP.Threshold = MetaThresholdValue->getSExtValue();
152 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
153 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
154 }
155 }
156 }
157
158 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
159 for (const BasicBlock *BB : L->getBlocks()) {
160 const DataLayout &DL = BB->getDataLayout();
161 unsigned LocalGEPsSeen = 0;
162
163 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
164 return SubLoop->contains(BB); }))
165 continue; // Block belongs to an inner loop.
166
167 for (const Instruction &I : *BB) {
168 // Unroll a loop which contains an "if" statement whose condition is
169 // defined by a PHI belonging to the loop. This may help to eliminate
170 // if region and potentially even PHI itself, saving on both divergence
171 // and registers used for the PHI.
172 // Add a small bonus for each of such "if" statements.
173 if (const CondBrInst *Br = dyn_cast<CondBrInst>(&I)) {
174 if (UP.Threshold < MaxBoost) {
175 BasicBlock *Succ0 = Br->getSuccessor(0);
176 BasicBlock *Succ1 = Br->getSuccessor(1);
177 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
178 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
179 continue;
180 if (dependsOnLocalPhi(L, Br->getCondition())) {
182 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
183 << " for loop:\n"
184 << *L << " due to " << *Br << '\n');
185 if (UP.Threshold >= MaxBoost)
186 return;
187 }
188 }
189 continue;
190 }
191
193 if (!GEP)
194 continue;
195
196 unsigned AS = GEP->getAddressSpace();
197 unsigned Threshold = 0;
199 Threshold = ThresholdPrivate;
201 Threshold = ThresholdLocal;
202 else
203 continue;
204
205 if (UP.Threshold >= Threshold)
206 continue;
207
208 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
209 const Value *Ptr = GEP->getPointerOperand();
210 const AllocaInst *Alloca =
212 if (!Alloca || !Alloca->isStaticAlloca())
213 continue;
214 auto AllocaSize = Alloca->getAllocationSize(DL);
215 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
216 continue;
217 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
219 LocalGEPsSeen++;
220 // Inhibit unroll for local memory if we have seen addressing not to
221 // a variable, most likely we will be unable to combine it.
222 // Do not unroll too deep inner loops for local memory to give a chance
223 // to unroll an outer loop for a more important reason.
224 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
225 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
226 !isa<Argument>(GEP->getPointerOperand())))
227 continue;
228 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
229 << *L << " due to LDS use.\n");
231 }
232
233 // Check if GEP depends on a value defined by this loop itself.
234 bool HasLoopDef = false;
235 for (const Value *Op : GEP->operands()) {
236 const Instruction *Inst = dyn_cast<Instruction>(Op);
237 if (!Inst || L->isLoopInvariant(Op))
238 continue;
239
240 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
241 return SubLoop->contains(Inst); }))
242 continue;
243 HasLoopDef = true;
244 break;
245 }
246 if (!HasLoopDef)
247 continue;
248
249 // We want to do whatever we can to limit the number of alloca
250 // instructions that make it through to the code generator. allocas
251 // require us to use indirect addressing, which is slow and prone to
252 // compiler bugs. If this loop does an address calculation on an
253 // alloca ptr, then we want to use a higher than normal loop unroll
254 // threshold. This will give SROA a better chance to eliminate these
255 // allocas.
256 //
257 // We also want to have more unrolling for local memory to let ds
258 // instructions with different offsets combine.
259 //
260 // Don't use the maximum allowed value here as it will make some
261 // programs way too big.
262 UP.Threshold = Threshold;
263 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
264 << " for loop:\n"
265 << *L << " due to " << *GEP << '\n');
266 if (UP.Threshold >= MaxBoost)
267 return;
268 }
269
270 // If we got a GEP in a small BB from inner loop then increase max trip
271 // count to analyze for better estimation cost in unroll
272 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
274 }
275}
276
281
285
286const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
287 // Codegen control options which don't matter.
288 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
289 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
290 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
291
292 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
293
294 // Property of the kernel/environment which can't actually differ.
295 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
296 AMDGPU::FeatureXNACKOnOffModes, AMDGPU::FeatureSupportsXNACK,
297 AMDGPU::FeatureTrapHandler,
298
299 // The default assumption needs to be ecc is enabled, but no directly
300 // exposed operations depend on it, so it can be safely inlined.
301 AMDGPU::FeatureSRAMECC,
302
303 // Perf-tuning features
304 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
305
307 : BaseT(TM, F.getDataLayout()),
308 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
309 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
310 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
312 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
313 HasFP64FP16Denormals =
314 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
315}
316
318 return !F || !ST->isSingleLaneExecution(*F);
319}
320
321unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
322 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
323 // registers. See getRegisterClassForType for the implementation.
324 // In this case vector registers are not vector in terms of
325 // VGPRs, but those which can hold multiple values.
326
327 // This is really the number of registers to fill when vectorizing /
328 // interleaving loops, so we lie to avoid trying to use all registers.
329 return 4;
330}
331
334 switch (K) {
336 return TypeSize::getFixed(32);
338 return TypeSize::getFixed((ST->hasPackedFP64Ops() || ST->hasPackedU64Ops())
339 ? 128
340 : ST->hasPackedFP32Ops() ? 64
341 : 32);
343 return TypeSize::getScalable(0);
344 }
345 llvm_unreachable("Unsupported register kind");
346}
347
349 return 32;
350}
351
352unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
353 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
354 return 32 * 4 / ElemWidth;
355 // For a given width return the max 0number of elements that can be combined
356 // into a wider bit value:
357 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
358 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
359 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
360 : (ElemWidth == 64 &&
361 (ST->hasPackedFP64Ops() || ST->hasPackedU64Ops()))
362 ? 2
363 : 1;
364}
365
367 // The integer inst-count heuristic causes regressions on gfx94x and gfx950
368 // because 2-element vector trees that pass the scalar/vector instruction
369 // count comparison still widen scalar moves (e.g. v_mov_b32 to v_mov_b64)
370 // after codegen, increasing register pressure and throughput cost without
371 // reducing the total instruction count.
372 return !ST->hasGFX940Insts() && !ST->hasGFX950Insts();
373}
374
375unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
376 unsigned ChainSizeInBytes,
377 VectorType *VecTy) const {
378 unsigned VecRegBitWidth = VF * LoadSize;
379 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
380 // TODO: Support element-size less than 32bit?
381 return 128 / LoadSize;
382
383 return VF;
384}
385
386unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
387 unsigned ChainSizeInBytes,
388 VectorType *VecTy) const {
389 unsigned VecRegBitWidth = VF * StoreSize;
390 if (VecRegBitWidth > 128)
391 return 128 / StoreSize;
392
393 return VF;
394}
395
// Returns the load/store vector register bit width to use for AddrSpace:
// 512 for the address spaces tested by the first condition, 8 times
// ST->getMaxPrivateElementSize() for private memory, and 128 otherwise.
//
// NOTE(review): some lines of the first condition are not visible in this
// listing, so the complete set of address spaces that yield 512 cannot be
// confirmed from here - check the upstream file.
396unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
397 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
398 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
400 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
401 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
403 return 512;
404 }
405
// Private memory is limited by the subtarget's maximum private element size.
406 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
407 return 8 * ST->getMaxPrivateElementSize();
408
409 // Common to flat, global, local and region. Assume for unknown addrspace.
410 return 128;
411}
412
413bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
414 Align Alignment,
415 unsigned AddrSpace) const {
416 // We allow vectorization of flat stores, even though we may need to decompose
417 // them later if they may access private memory. We don't have enough context
418 // here, and legalization can handle it.
419 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
420 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
421 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
422 }
423 return true;
424}
425
426bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
427 Align Alignment,
428 unsigned AddrSpace) const {
429 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
430}
431
432bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
433 Align Alignment,
434 unsigned AddrSpace) const {
435 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
436}
437
441
443 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
444 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
445 std::optional<uint32_t> AtomicElementSize) const {
446
447 if (AtomicElementSize)
448 return Type::getIntNTy(Context, *AtomicElementSize * 8);
449
450 // 16-byte accesses achieve the highest copy throughput.
451 // If the operation has a fixed known length that is large enough, it is
452 // worthwhile to return an even wider type and let legalization lower it into
453 // multiple accesses, effectively unrolling the memcpy loop.
454 // We also rely on legalization to decompose into smaller accesses for
455 // subtargets and address spaces where it is necessary.
456 //
457 // Don't unroll if Length is not a constant, since unrolling leads to worse
458 // performance for length values that are smaller or slightly larger than the
459 // total size of the type returned here. Mitigating that would require a more
460 // complex lowering for variable-length memcpy and memmove.
461 unsigned I32EltsInVector = 4;
464 MemcpyLoopUnroll * I32EltsInVector);
465
466 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
467}
468
470 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
471 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
472 Align SrcAlign, Align DestAlign,
473 std::optional<uint32_t> AtomicCpySize) const {
474
475 if (AtomicCpySize)
477 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
478 DestAlign, AtomicCpySize);
479
480 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
481 while (RemainingBytes >= 16) {
482 OpsOut.push_back(I32x4Ty);
483 RemainingBytes -= 16;
484 }
485
486 Type *I64Ty = Type::getInt64Ty(Context);
487 while (RemainingBytes >= 8) {
488 OpsOut.push_back(I64Ty);
489 RemainingBytes -= 8;
490 }
491
492 Type *I32Ty = Type::getInt32Ty(Context);
493 while (RemainingBytes >= 4) {
494 OpsOut.push_back(I32Ty);
495 RemainingBytes -= 4;
496 }
497
498 Type *I16Ty = Type::getInt16Ty(Context);
499 while (RemainingBytes >= 2) {
500 OpsOut.push_back(I16Ty);
501 RemainingBytes -= 2;
502 }
503
504 Type *I8Ty = Type::getInt8Ty(Context);
505 while (RemainingBytes) {
506 OpsOut.push_back(I8Ty);
507 --RemainingBytes;
508 }
509}
510
512 // Disable unrolling if the loop is not vectorized.
513 // TODO: Enable this again.
514 if (VF.isScalar())
515 return 1;
516
517 return 8;
518}
519
521 MemIntrinsicInfo &Info) const {
522 switch (Inst->getIntrinsicID()) {
523 case Intrinsic::amdgcn_ds_ordered_add:
524 case Intrinsic::amdgcn_ds_ordered_swap: {
525 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
526 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
527 if (!Ordering || !Volatile)
528 return false; // Invalid.
529
530 unsigned OrderingVal = Ordering->getZExtValue();
531 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
532 return false;
533
534 Info.PtrVal = Inst->getArgOperand(0);
535 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
536 Info.ReadMem = true;
537 Info.WriteMem = true;
538 Info.IsVolatile = !Volatile->isZero();
539 return true;
540 }
541 default:
542 return false;
543 }
544}
545
547 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
549 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
550
551 // Legalize the type.
552 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
553 int ISD = TLI->InstructionOpcodeToISD(Opcode);
554
555 // Because we don't have any legal vector operations, but the legal types, we
556 // need to account for split vectors.
557 unsigned NElts = LT.second.isVector() ?
558 LT.second.getVectorNumElements() : 1;
559
560 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
561
562 switch (ISD) {
563 case ISD::SHL:
564 case ISD::SRL:
565 case ISD::SRA:
566 if (SLT == MVT::i64)
567 return get64BitInstrCost(CostKind) * LT.first * NElts;
568
569 if (ST->has16BitInsts() && SLT == MVT::i16)
570 NElts = (NElts + 1) / 2;
571
572 // i32
573 return getFullRateInstrCost() * LT.first * NElts;
574 case ISD::ADD:
575 case ISD::SUB:
576 if (SLT == MVT::i64 && ST->hasPackedU64Ops())
577 NElts = (NElts + 1) / 2;
578 [[fallthrough]];
579 case ISD::AND:
580 case ISD::OR:
581 case ISD::XOR:
582 if (SLT == MVT::i64) {
583 // and, or and xor are typically split into 2 VALU instructions.
584 return 2 * getFullRateInstrCost() * LT.first * NElts;
585 }
586
587 if (ST->has16BitInsts() && SLT == MVT::i16)
588 NElts = (NElts + 1) / 2;
589
590 return LT.first * NElts * getFullRateInstrCost();
591 case ISD::MUL: {
592 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
593 if (SLT == MVT::i64) {
594 const int FullRateCost = getFullRateInstrCost();
595 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
596 }
597
598 if (ST->has16BitInsts() && SLT == MVT::i16)
599 NElts = (NElts + 1) / 2;
600
601 // i32
602 return QuarterRateCost * NElts * LT.first;
603 }
604 case ISD::FMUL:
605 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
606 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
607 // fused operation.
608 if (CxtI && CxtI->hasOneUse())
609 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
610 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
611 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
612 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
614 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
616
617 // Estimate all types may be fused with contract/unsafe flags
618 const TargetOptions &Options = TLI->getTargetMachine().Options;
619 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
620 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
622 }
623 }
624 [[fallthrough]];
625 case ISD::FADD:
626 case ISD::FSUB:
627 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
628 NElts = (NElts + 1) / 2;
629 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
630 NElts = (NElts + 1) / 2;
631 if (SLT == MVT::f64) {
632 if (ST->hasPackedFP64Ops())
633 NElts = (NElts + 1) / 2;
634 return LT.first * NElts * get64BitInstrCost(CostKind);
635 }
636
637 if (ST->has16BitInsts() && SLT == MVT::f16)
638 NElts = (NElts + 1) / 2;
639
640 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
641 return LT.first * NElts * getFullRateInstrCost();
642 break;
643 case ISD::FDIV:
644 case ISD::FREM:
645 // FIXME: frem should be handled separately. The fdiv in it is most of it,
646 // but the current lowering is also not entirely correct.
647 if (SLT == MVT::f64) {
648 int Cost = 7 * get64BitInstrCost(CostKind) +
649 getQuarterRateInstrCost(CostKind) +
650 3 * getHalfRateInstrCost(CostKind);
651 // Add cost of workaround.
652 if (!ST->hasUsableDivScaleConditionOutput())
653 Cost += 3 * getFullRateInstrCost();
654
655 return LT.first * Cost * NElts;
656 }
657
658 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
659 // TODO: This is more complicated, unsafe flags etc.
660 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
661 (SLT == MVT::f16 && ST->has16BitInsts())) {
662 return LT.first * getTransInstrCost(CostKind) * NElts;
663 }
664 }
665
666 if (SLT == MVT::f16 && ST->has16BitInsts()) {
667 // 2 x v_cvt_f32_f16
668 // f32 rcp
669 // f32 fmul
670 // v_cvt_f16_f32
671 // f16 div_fixup
672 int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
673 return LT.first * Cost * NElts;
674 }
675
676 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
677 // Fast unsafe fdiv lowering:
678 // f32 rcp
679 // f32 fmul
680 int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
681 return LT.first * Cost * NElts;
682 }
683
684 if (SLT == MVT::f32 || SLT == MVT::f16) {
685 // 4 more v_cvt_* insts without f16 insts support
686 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
687 1 * getTransInstrCost(CostKind);
688
689 if (!HasFP32Denormals) {
690 // FP mode switches.
691 Cost += 2 * getFullRateInstrCost();
692 }
693
694 return LT.first * NElts * Cost;
695 }
696 break;
697 case ISD::FNEG:
698 // Use the backend's estimation. If fneg is not free each element will cost
699 // one additional instruction.
700 return TLI->isFNegFree(SLT) ? 0 : NElts;
701 default:
702 break;
703 }
704
705 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
706 Args, CxtI);
707}
708
709// Return true if there's a potential benefit from using v2f16/v2i16
710// instructions for an intrinsic, even if it requires nontrivial legalization.
712 switch (ID) {
713 case Intrinsic::fma:
714 case Intrinsic::fmuladd:
715 case Intrinsic::copysign:
716 case Intrinsic::minimumnum:
717 case Intrinsic::maximumnum:
718 case Intrinsic::canonicalize:
719 // There's a small benefit to using vector ops in the legalized code.
720 case Intrinsic::round:
721 case Intrinsic::uadd_sat:
722 case Intrinsic::usub_sat:
723 case Intrinsic::sadd_sat:
724 case Intrinsic::ssub_sat:
725 case Intrinsic::abs:
726 return true;
727 default:
728 return false;
729 }
730}
731
735 switch (ICA.getID()) {
736 case Intrinsic::fabs:
737 // Free source modifier in the common case.
738 return 0;
739 case Intrinsic::amdgcn_workitem_id_x:
740 case Intrinsic::amdgcn_workitem_id_y:
741 case Intrinsic::amdgcn_workitem_id_z:
742 // TODO: If hasPackedTID, or if the calling context is not an entry point
743 // there may be a bit instruction.
744 return 0;
745 case Intrinsic::amdgcn_workgroup_id_x:
746 case Intrinsic::amdgcn_workgroup_id_y:
747 case Intrinsic::amdgcn_workgroup_id_z:
748 case Intrinsic::amdgcn_lds_kernel_id:
749 case Intrinsic::amdgcn_dispatch_ptr:
750 case Intrinsic::amdgcn_dispatch_id:
751 case Intrinsic::amdgcn_implicitarg_ptr:
752 case Intrinsic::amdgcn_queue_ptr:
753 // Read from an argument register.
754 return 0;
755 default:
756 break;
757 }
758
759 Type *RetTy = ICA.getReturnType();
760
761 Intrinsic::ID IID = ICA.getID();
762 switch (IID) {
763 case Intrinsic::exp:
764 case Intrinsic::exp2:
765 case Intrinsic::exp10: {
766 // Legalize the type.
767 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
768 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
769 unsigned NElts =
770 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
771
772 if (SLT == MVT::f64) {
773 unsigned NumOps = 20;
774 if (IID == Intrinsic::exp)
775 ++NumOps;
776 else if (IID == Intrinsic::exp10)
777 NumOps += 3;
778
779 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
780 }
781
782 if (SLT == MVT::f32) {
783 unsigned NumFullRateOps = 0;
784 // v_exp_f32 (transcendental).
785 unsigned NumTransOps = 1;
786
787 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
788 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
789 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
790 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
791 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
792 } else {
793 if (IID == Intrinsic::exp) {
794 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
795 NumFullRateOps = 1;
796 } else if (IID == Intrinsic::exp10) {
797 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
798 NumFullRateOps = 3;
799 NumTransOps = 2;
800 }
801 // Denorm scaling adds setcc + select + fadd + select + fmul.
802 if (HasFP32Denormals)
803 NumFullRateOps += 5;
804 }
805
806 InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
807 NumTransOps * getTransInstrCost(CostKind);
808 return LT.first * NElts * Cost;
809 }
810
811 break;
812 }
813 case Intrinsic::log:
814 case Intrinsic::log2:
815 case Intrinsic::log10: {
816 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
817 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
818 unsigned NElts =
819 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
820
821 if (SLT == MVT::f32) {
822 unsigned NumFullRateOps = 0;
823
824 if (IID == Intrinsic::log2) {
825 // LowerFLOG2: just v_log_f32.
826 } else if (ICA.getFlags().approxFunc()) {
827 // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
828 NumFullRateOps = 1;
829 } else {
830 // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
831 // multiply + finite check.
832 NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
833 }
834
835 if (HasFP32Denormals)
836 NumFullRateOps += 5;
837
839 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
840 return LT.first * NElts * Cost;
841 }
842
843 break;
844 }
845 case Intrinsic::sin:
846 case Intrinsic::cos: {
847 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
848 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
849 unsigned NElts =
850 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
851
852 if (SLT == MVT::f32) {
853 // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
854 unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;
855
857 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
858 return LT.first * NElts * Cost;
859 }
860
861 break;
862 }
863 case Intrinsic::sqrt: {
864 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
865 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
866 unsigned NElts =
867 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
868
869 if (SLT == MVT::f32) {
870 unsigned NumFullRateOps = 0;
871
872 if (!ICA.getFlags().approxFunc()) {
873 // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
874 NumFullRateOps = HasFP32Denormals ? 17 : 16;
875 }
876
878 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
879 return LT.first * NElts * Cost;
880 }
881
882 break;
883 }
884 default:
885 break;
886 }
887
890
891 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
892 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
893 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
894
895 if ((ST->hasVOP3PInsts() &&
896 (SLT == MVT::f16 || SLT == MVT::i16 ||
897 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
898 (ST->hasPackedFP64Ops() && SLT == MVT::f64) ||
899 (ST->hasPackedU64Ops() && SLT == MVT::i64)) {
900 NElts = (NElts + 1) / 2;
901 } else if (SLT == MVT::f32) {
902 bool HasPk2FP32Op = ST->hasPackedFP32Ops() &&
903 IID != Intrinsic::minimumnum &&
904 IID != Intrinsic::maximumnum;
905 NElts = HasPk2FP32Op ? (NElts + 1) / 2 : NElts;
906 }
907
908 // TODO: Get more refined intrinsic costs?
909 unsigned InstRate = getQuarterRateInstrCost(CostKind);
910
911 switch (ICA.getID()) {
912 case Intrinsic::fma:
913 case Intrinsic::fmuladd:
914 if (SLT == MVT::f64) {
915 InstRate = get64BitInstrCost(CostKind);
916 break;
917 }
918
919 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
920 InstRate = getFullRateInstrCost();
921 else {
922 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
923 : getQuarterRateInstrCost(CostKind);
924 }
925 break;
926 case Intrinsic::copysign:
927 return NElts * getFullRateInstrCost();
928 case Intrinsic::minimumnum:
929 case Intrinsic::maximumnum: {
930 // Instruction + 2 canonicalizes. For cases that need type promotion, the
931 // promotion takes the place of the canonicalize.
932 unsigned NumOps = 3;
933 if (const IntrinsicInst *II = ICA.getInst()) {
934 // Directly legal with ieee=0
935 // TODO: Not directly legal with strictfp
937 NumOps = 1;
938 }
939
940 unsigned BaseRate =
941 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
942 InstRate = BaseRate * NumOps;
943 break;
944 }
945 case Intrinsic::canonicalize: {
946 InstRate =
947 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
948 break;
949 }
950 case Intrinsic::uadd_sat:
951 case Intrinsic::usub_sat:
952 case Intrinsic::sadd_sat:
953 case Intrinsic::ssub_sat: {
954 if (SLT == MVT::i16 || SLT == MVT::i32)
955 InstRate = getFullRateInstrCost();
956
957 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
958 if (any_of(ValidSatTys, equal_to(LT.second)))
959 NElts = 1;
960 break;
961 }
962 case Intrinsic::abs:
963 // Expansion takes 2 instructions for VALU
964 if (SLT == MVT::i16 || SLT == MVT::i32)
965 InstRate = 2 * getFullRateInstrCost();
966 break;
967 default:
968 break;
969 }
970
971 return LT.first * NElts * InstRate;
972}
973
976 const Instruction *I) const {
977 assert((I == nullptr || I->getOpcode() == Opcode) &&
978 "Opcode should reflect passed instruction.");
979 const bool SCost =
981 const int CBrCost = SCost ? 5 : 7;
982 switch (Opcode) {
983 case Instruction::UncondBr:
984 // Branch instruction takes about 4 slots on gfx900.
985 return SCost ? 1 : 4;
986 case Instruction::CondBr:
987 // Assume a conditional branch takes 3 additional exec manipulation
988 // instructions on average.
989 return CBrCost;
990 case Instruction::Switch: {
991 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
992 // Each case (including default) takes 1 cmp + 1 cbr instruction on
993 // average.
994 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
995 }
996 case Instruction::Ret:
997 return SCost ? 1 : 10;
998 }
999 return BaseT::getCFInstrCost(Opcode, CostKind, I);
1000}
1001
// Cost of an arithmetic reduction over the vector type Ty.
// NOTE(review): listing lines 1002-1003 and 1005-1006 (signature head and the
// guard whose body is the first return below) are missing from this
// extracted listing.
1004 std::optional<FastMathFlags> FMF,
1007 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1008
1009 EVT OrigTy = TLI->getValueType(DL, Ty);
1010
1011 // Computes cost on targets that have packed math instructions(which support
1012 // 16-bit types only).
1013 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1014 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1015
// Packed 16-bit case: legalization count times the full-rate instruction
// cost.
1016 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1017 return LT.first * getFullRateInstrCost();
1018}
1019
// Cost of a min/max reduction (intrinsic IID) over the vector type Ty.
// NOTE(review): listing lines 1020-1021 and 1023 (signature head and the
// CostKind parameter) are missing from this extracted listing.
1022 FastMathFlags FMF,
1024 EVT OrigTy = TLI->getValueType(DL, Ty);
1025
1026 // Computes cost on targets that have packed math instructions(which support
1027 // 16-bit types only).
1028 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1029 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1030
// Packed 16-bit case: legalization count times the half-rate instruction
// cost.
1031 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1032 return LT.first * getHalfRateInstrCost(CostKind);
1033}
1034
// Cost of an insertelement / extractelement on vector type ValTy at Index.
// Any other opcode is forwarded unchanged to BaseT::getVectorInstrCost.
// NOTE(review): the first line of this signature (listing line 1035) is
// missing from this extracted listing.
1036 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
1037 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
1038 switch (Opcode) {
1039 case Instruction::ExtractElement:
1040 case Instruction::InsertElement: {
1041 unsigned EltSize
1042 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1043 // Dynamic indexing isn't free and is best avoided.
1044 if (Index == ~0u)
1045 return 2;
// Elements narrower than 32 bits: only the special cases below are free;
// everything else uses the base implementation's cost.
1046 if (EltSize < 32) {
1047 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
1048 return 0;
1049 // Extract element sequences of consecutive i8 values that match a
1050 // register size are free most likely. It is not possible to know
1051 // if this extract is part of a consecutive sequence so this may
1052 // apply more generally.
1053 if (Opcode == Instruction::ExtractElement && EltSize == 8) {
1054 if (auto *FVTy = dyn_cast<FixedVectorType>(ValTy)) {
1055 unsigned NumElts = FVTy->getNumElements();
1056 if (NumElts >= 4 && isPowerOf2_32(NumElts))
1057 return 0;
1058 }
1059 }
1060 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1061 VIC);
1062 }
1063
1064 // Extracts are just reads of a subregister, so are free. Inserts are
1065 // considered free because we don't want to have any cost for scalarizing
1066 // operations, and we don't have to copy into a different register class.
1067 return 0;
1068 }
1069 default:
1070 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1071 VIC);
1072 }
1073}
1074
1075/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
1076/// this is analyzing the collective result of all output registers. Otherwise,
1077/// this is only querying a specific result index if this returns multiple
1078/// registers in a struct.
///
/// \returns true (divergent) as soon as an inspected output constraint does
/// not resolve to an SGPR register class, or when more than one extract
/// index is given; false if every inspected output is an SGPR.
// NOTE(review): the first line of the signature (listing line 1079) is
// missing from this extracted listing.
1080 const CallInst *CI, ArrayRef<unsigned> Indices) const {
1081 // TODO: Handle complex extract indices
1082 if (Indices.size() > 1)
1083 return true;
1084
1085 const DataLayout &DL = CI->getDataLayout();
1086 const SIRegisterInfo *TRI = ST->getRegisterInfo();
1087 TargetLowering::AsmOperandInfoVector TargetConstraints =
1088 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
1089
// -1 means "inspect every output"; otherwise only the output at this index.
1090 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
1091
1092 int OutputIdx = 0;
1093 for (auto &TC : TargetConstraints) {
1094 if (TC.Type != InlineAsm::isOutput)
1095 continue;
1096
1097 // Skip outputs we don't care about.
// OutputIdx is only advanced when a specific output is requested, because
// the post-increment sits on the right-hand side of the short-circuit &&.
1098 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
1099 continue;
1100
1101 TLI->ComputeConstraintToUse(TC, SDValue());
1102
1103 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
1104 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
1105
1106 // For AGPR constraints null is returned on subtargets without AGPRs, so
1107 // assume divergent for null.
1108 if (!RC || !TRI->isSGPRClass(RC))
1109 return true;
1110 }
1111
1112 return false;
1113}
1114
// Decides whether a read_register call is divergent, based on its result type
// and on the register name stored in its metadata operand.
// NOTE(review): listing lines 1115 and 1119 (signature head and the
// declaration of RegName) are missing from this extracted listing.
1116 const IntrinsicInst *ReadReg) const {
1117 Metadata *MD =
1118 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
1120 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1121
1122 // Special case registers that look like VCC.
1123 MVT VT = MVT::getVT(ReadReg->getType());
1124 if (VT == MVT::i1)
1125 return true;
1126
1127 // Special case scalar registers that start with 'v'.
// The empty() test also protects the RegName[0] accesses below.
1128 if (RegName.starts_with("vcc") || RegName.empty())
1129 return false;
1130
1131 // VGPR or AGPR is divergent. There aren't any specially named vector
1132 // registers.
1133 return RegName[0] == 'v' || RegName[0] == 'a';
1134}
1135
1136/// \returns true if the result of the value could potentially be
1137/// different across workitems in a wavefront.
// NOTE(review): several statements (listing lines 1140, 1156, 1159, 1163,
// 1182 and 1189) are missing from this extracted listing; the comments
// below describe only the visible lines.
1138bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1139 if (const Argument *A = dyn_cast<Argument>(V))
1141
1142 // Loads from the private and flat address spaces are divergent, because
1143 // threads can execute the load instruction with the same inputs and get
1144 // different results.
1145 //
1146 // All other loads are not divergent, because if threads issue loads with the
1147 // same arguments, they will always get the same result.
1148 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1149 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1150 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1151
1152 // Atomics are divergent because they are executed sequentially: when an
1153 // atomic operation refers to the same address in each thread, then each
1154 // thread after the first sees the value written by the previous thread as
1155 // original value.
1157 return true;
1158
1160 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1161 switch (IID) {
1162 case Intrinsic::read_register:
// Same private-to-flat rule as the AddrSpaceCastInst check at the end of
// this function.
1164 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1165 unsigned SrcAS =
1166 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1167 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1168 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1169 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1170 ST->hasGloballyAddressableScratch();
1171 }
// workitem.id.y / .z are reported divergent unless the subtarget query says
// Y/Z are uniform or this dimension's required work-group size is exactly 1.
1172 case Intrinsic::amdgcn_workitem_id_y:
1173 case Intrinsic::amdgcn_workitem_id_z: {
1174 const Function *F = Intrinsic->getFunction();
1175 bool HasUniformYZ =
1176 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true);
1177 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1178 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1179 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1180 }
1181 default:
1183 }
1184 }
1185
1186 // Assume all function calls are a source of divergence.
1187 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1188 if (CI->isInlineAsm())
1190 return true;
1191 }
1192
1193 // Assume all function calls are a source of divergence.
1194 if (isa<InvokeInst>(V))
1195 return true;
1196
1197 // If the target supports globally addressable scratch, the mapping from
1198 // scratch memory to the flat aperture changes therefore an address space cast
1199 // is no longer uniform.
1200 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1201 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1202 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1203 ST->hasGloballyAddressableScratch();
1204 }
1205
1206 return false;
1207}
1208
// Checks, in order: always-uniform intrinsics, inline asm calls, values
// matched against a constant or a mask (see the comment block below), and
// extractvalue of amdgcn.if / amdgcn.else / inline-asm results.
// NOTE(review): several lines (listing lines 1215, 1239, 1241 and 1247,
// i.e. the inline-asm return and the heads of the pattern matches) are
// missing from this extracted listing.
1209bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1210 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1211 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1212
1213 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1214 if (CI->isInlineAsm())
1216 return false;
1217 }
1218
1219 // In most cases TID / wavefrontsize is uniform.
1220 //
1221 // However, if a kernel has uneven dimensions we can have a value of
1222 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1223 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1224 // packed into a same wave which gives 1 and 0 after the division by 64
1225 // respectively.
1226 //
1227 // The X dimension doesn't reset within a wave if either both the Y
1228 // and Z dimensions are of length 1, or if the X dimension's required
1229 // size is a power of 2. Note, however, if the X dimension's maximum
1230 // size is a power of 2 < the wavefront size, division by the wavefront
1231 // size is guaranteed to yield 0, so this is also a no-reset case.
1232 bool XDimDoesntResetWithinWaves = false;
1233 if (auto *I = dyn_cast<Instruction>(V)) {
1234 const Function *F = I->getFunction();
1235 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1236 }
1237 using namespace llvm::PatternMatch;
1238 uint64_t C;
1240 m_ConstantInt(C))) ||
1242 m_ConstantInt(C)))) {
// Uniform only when the matched constant is at least log2(wavefront size).
1243 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1244 }
1245
1246 Value *Mask;
1248 m_Value(Mask)))) {
// Uniform only when the mask is known to have at least log2(wavefront
// size) trailing zero bits.
1249 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1250 ST->getWavefrontSizeLog2() &&
1251 XDimDoesntResetWithinWaves;
1252 }
1253
1254 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1255 if (!ExtValue)
1256 return false;
1257
1258 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1259 if (!CI)
1260 return false;
1261
1262 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1263 switch (Intrinsic->getIntrinsicID()) {
1264 default:
1265 return false;
// Only struct element 1 of the amdgcn.if / amdgcn.else result is treated as
// always uniform.
1266 case Intrinsic::amdgcn_if:
1267 case Intrinsic::amdgcn_else: {
1268 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1269 return Indices.size() == 1 && Indices[0] == 1;
1270 }
1271 }
1272 }
1273
1274 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1275 // divergent for the overall struct return. We need to override it in the
1276 // case we're extracting an SGPR component here.
1277 if (CI->isInlineAsm())
1278 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1279
1280 return false;
1281}
1282
// For the intrinsics listed below, records operand index 0 in OpIndexes and
// returns true; returns false (leaving OpIndexes untouched) for any other
// intrinsic ID.
// NOTE(review): the first line of the signature (listing line 1283) is
// missing from this extracted listing.
1284 Intrinsic::ID IID) const {
1285 switch (IID) {
1286 case Intrinsic::amdgcn_is_shared:
1287 case Intrinsic::amdgcn_is_private:
1288 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1289 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1290 case Intrinsic::amdgcn_load_to_lds:
1291 case Intrinsic::amdgcn_make_buffer_rsrc:
1292 OpIndexes.push_back(0);
1293 return true;
1294 default:
1295 return false;
1296 }
1297}
1298
// Rewrites intrinsic call II so that its pointer operand uses NewV instead of
// OldV. Returns the replacement value (a constant for the is_shared /
// is_private queries, II itself after in-place mutation for the others), or
// nullptr when no rewrite is performed.
// NOTE(review): listing lines 1299, 1307, 1311, 1319, 1322 and 1341 are
// missing from this extracted listing (signature head, the address-space
// constants, the constant construction, a guard condition and two
// declaration heads).
1300 Value *OldV,
1301 Value *NewV) const {
1302 auto IntrID = II->getIntrinsicID();
1303 switch (IntrID) {
1304 case Intrinsic::amdgcn_is_shared:
1305 case Intrinsic::amdgcn_is_private: {
// The query folds to a constant chosen by comparing the address space the
// intrinsic tests for with the address space of NewV.
1306 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1308 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1309 LLVMContext &Ctx = NewV->getType()->getContext();
1310 ConstantInt *NewVal = (TrueAS == NewAS) ?
1312 return NewVal;
1313 }
1314 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1315 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1316 Type *DestTy = II->getType();
1317 Type *SrcTy = NewV->getType();
1318 unsigned NewAS = SrcTy->getPointerAddressSpace();
1320 return nullptr;
1321 Module *M = II->getModule();
// Re-declare the intrinsic for the new pointer type, then patch the call in
// place.
1323 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1324 II->setArgOperand(0, NewV);
1325 II->setCalledFunction(NewDecl);
1326 return II;
1327 }
1328 case Intrinsic::amdgcn_load_to_lds: {
1329 Type *SrcTy = NewV->getType();
1330 Module *M = II->getModule();
1331 Function *NewDecl =
1332 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1333 II->setArgOperand(0, NewV);
1334 II->setCalledFunction(NewDecl);
1335 return II;
1336 }
1337 case Intrinsic::amdgcn_make_buffer_rsrc: {
1338 Type *SrcTy = NewV->getType();
1339 Type *DstTy = II->getType();
1340 Module *M = II->getModule();
1342 M, II->getIntrinsicID(), {DstTy, SrcTy});
1343 II->setArgOperand(0, NewV);
1344 II->setCalledFunction(NewDecl);
1345 return II;
1346 }
1347 default:
1348 return nullptr;
1349 }
1350}
1351
// Cost of a vector shuffle of kind Kind from SrcTy to DstTy.
// Non-fixed source vectors, and element sizes other than 8/16 bits (or
// subtargets older than VOLCANIC_ISLANDS), are costed by the base
// implementation. Otherwise the cost is counted in 32-bit registers, with
// EltsPerReg elements per register.
// NOTE(review): listing lines 1352, 1355, 1357, 1396-1397, 1402-1403, 1406,
// 1430 and 1461 are missing from this extracted listing (signature parts,
// two case labels with their preceding returns, two early returns and the
// declaration of Regs).
1353 VectorType *DstTy, VectorType *SrcTy,
1354 ArrayRef<int> Mask,
1356 int Index, VectorType *SubTp,
1358 const Instruction *CxtI) const {
1359 if (!isa<FixedVectorType>(SrcTy))
1360 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1361 SubTp);
1362
1363 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1364
1365 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1366 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1367 (ScalarSize == 16 || ScalarSize == 8)) {
1368 // Larger vector widths may require additional instructions, but are
1369 // typically cheaper than scalarized versions.
1370 //
1371 // We assume that shuffling at a register granularity can be done for free.
1372 // This is not true for vectors fed into memory instructions, but it is
1373 // effectively true for all other shuffling. The emphasis of the logic here
1374 // is to assist generic transform in cleaning up / canonicalizing those
1375 // shuffles.
1376
1377 // With op_sel VOP3P instructions freely can access the low half or high
1378 // half of a register, so any swizzle of two elements is free.
1379 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1380 unsigned NumSrcElts = SrcVecTy->getNumElements();
1381 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1382 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1383 Kind == TTI::SK_PermuteSingleSrc))
1384 return 0;
1385 }
1386
// ScalarSize is 8 or 16 here, so this is 4 or 2 elements per register.
1387 unsigned EltsPerReg = 32 / ScalarSize;
1388 switch (Kind) {
1389 case TTI::SK_Broadcast:
1390 // A single v_perm_b32 can be re-used for all destination registers.
1391 return 1;
1392 case TTI::SK_Reverse:
1393 // One instruction per register.
1394 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1395 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1398 if (Index % EltsPerReg == 0)
1399 return 0; // Shuffling at register granularity
1400 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1401 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1404 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1405 if (!DstVecTy)
1407 unsigned NumDstElts = DstVecTy->getNumElements();
1408 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1409 unsigned EndIndex = Index + NumInsertElts;
1410 unsigned BeginSubIdx = Index % EltsPerReg;
1411 unsigned EndSubIdx = EndIndex % EltsPerReg;
1412 unsigned Cost = 0;
1413
1414 if (BeginSubIdx != 0) {
1415 // Need to shift the inserted vector into place. The cost is the number
1416 // of destination registers overlapped by the inserted vector.
1417 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1418 }
1419
1420 // If the last register overlap is partial, there may be three source
1421 // registers feeding into it; that takes an extra instruction.
1422 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1423 Cost += 1;
1424
1425 return Cost;
1426 }
1427 case TTI::SK_Splice: {
1428 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1429 if (!DstVecTy)
1431 unsigned NumElts = DstVecTy->getNumElements();
1432 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1433 // Determine the sub-region of the result vector that requires
1434 // sub-register shuffles / mixing.
1435 unsigned EltsFromLHS = NumElts - Index;
1436 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1437 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1438 if (LHSIsAligned && RHSIsAligned)
1439 return 0;
1440 if (LHSIsAligned && !RHSIsAligned)
1441 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1442 if (!LHSIsAligned && RHSIsAligned)
1443 return divideCeil(EltsFromLHS, EltsPerReg);
1444 return divideCeil(NumElts, EltsPerReg);
1445 }
1446 default:
1447 break;
1448 }
1449
1450 if (!Mask.empty()) {
1451 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1452
1453 // Generically estimate the cost by assuming that each destination
1454 // register is derived from sources via v_perm_b32 instructions if it
1455 // can't be copied as-is.
1456 //
1457 // For each destination register, derive the cost of obtaining it based
1458 // on the number of source registers that feed into it.
1459 unsigned Cost = 0;
1460 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1462 bool Aligned = true;
1463 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1464 int SrcIdx = Mask[DstIdx + I];
// Mask entries of -1 place no constraint on this lane.
1465 if (SrcIdx == -1)
1466 continue;
1467 int Reg;
1468 if (SrcIdx < (int)NumSrcElts) {
1469 Reg = SrcIdx / EltsPerReg;
1470 if (SrcIdx % EltsPerReg != I)
1471 Aligned = false;
1472 } else {
// Registers of the second source are offset by NumSrcElts so their ids
// cannot collide with first-source register ids.
1473 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1474 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1475 Aligned = false;
1476 }
1477 if (!llvm::is_contained(Regs, Reg))
1478 Regs.push_back(Reg);
1479 }
// N distinct source registers cost N - 1; a single misaligned source
// register costs 1; an aligned single-source copy costs nothing.
1480 if (Regs.size() >= 2)
1481 Cost += Regs.size() - 1;
1482 else if (!Aligned)
1483 Cost += 1;
1484 }
1485 return Cost;
1486 }
1487 }
1488
1489 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1490 SubTp);
1491}
1492
1493/// Whether it is profitable to sink the operands of an
1494/// Instruction I to the basic block of I.
1495/// This helps using several modifiers (like abs and neg) more often.
///
/// Operands judged worth sinking are appended to \p Ops; the result is true
/// when \p Ops is non-empty on return.
// NOTE(review): the first line of the signature (listing line 1496) and one
// argument line of the getVectorInstrCost call (listing line 1519) are
// missing from this extracted listing.
1497 SmallVectorImpl<Use *> &Ops) const {
1498 using namespace PatternMatch;
1499
1500 for (auto &Op : I->operands()) {
1501 // Ensure we are not already sinking this operand.
1502 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1503 continue;
1504
1505 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1506 Ops.push_back(&Op);
1507 continue;
1508 }
1509
1510 // Check for zero-cost multiple use InsertElement/ExtractElement
1511 // instructions
1512 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1513 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1514 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1515 if (VecOpInst && VecOpInst->hasOneUse())
1516 continue;
1517
1518 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1520 OpInst->getOperand(0),
1521 OpInst->getOperand(1)) == 0) {
1522 Ops.push_back(&Op);
1523 continue;
1524 }
1525 }
1526 }
1527
1528 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1529
1530 unsigned EltSize = DL.getTypeSizeInBits(
1531 cast<VectorType>(Shuffle->getType())->getElementType());
1532
1533 // For i32 (or greater) shufflevectors, these will be lowered into a
1534 // series of insert / extract elements, which will be coalesced away.
1535 if (EltSize < 16 || !ST->has16BitInsts())
1536 continue;
1537
1538 int NumSubElts, SubIndex;
1539 if (Shuffle->changesLength()) {
1540 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1541 Ops.push_back(&Op);
1542 continue;
1543 }
1544
// Sub-vector extracts/inserts are only sunk when the sub-vector starts at an
// even element index.
1545 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1546 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1547 !(SubIndex & 0x1)) {
1548 Ops.push_back(&Op);
1549 continue;
1550 }
1551 }
1552
1553 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1554 Shuffle->isSingleSource()) {
1555 Ops.push_back(&Op);
1556 continue;
1557 }
1558 }
1559 }
1560
1561 return !Ops.empty();
1562}
1563
// Decides whether Callee may be inlined into Caller. Inlining is rejected when
// the callee uses subtarget features the caller lacks (ignoring the features
// in InlineFeatureIgnoreList), when the mode-register defaults are
// incompatible, or when the combined basic-block count exceeds InlineMaxBB.
// NOTE(review): the first line of the signature (listing line 1564) is
// missing from this extracted listing.
1565 const Function *Callee) const {
1566 const TargetMachine &TM = getTLI()->getTargetMachine();
1567 const GCNSubtarget *CallerST
1568 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1569 const GCNSubtarget *CalleeST
1570 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1571
1572 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1573 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1574
// The callee's remaining feature bits must be a subset of the caller's.
1575 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1576 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1577 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1578 return false;
1579
1580 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1581 // no way to support merge for backend defined attributes.
1582 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1583 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1584 if (!CallerMode.isInlineCompatible(CalleeMode))
1585 return false;
1586
// alwaysinline / inlinehint callees skip the basic-block budget below.
1587 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1588 Callee->hasFnAttribute(Attribute::InlineHint))
1589 return true;
1590
1591 // Hack to make compile times reasonable.
1592 if (InlineMaxBB) {
1593 // Single BB does not increase total BB amount.
1594 if (Callee->size() == 1)
1595 return true;
1596 size_t BBSize = Caller->size() + Callee->size() - 1;
1597 return BBSize <= InlineMaxBB;
1598 }
1599
1600 return true;
1601}
1602
// Computes an inlining-threshold increase for call site CB from the number of
// argument registers that exceed fixed SGPR / VGPR budgets (26 and 32).
// NOTE(review): listing lines 1603, 1620, 1635 and 1638 are missing from this
// extracted listing (signature head, the condition choosing SGPR vs VGPR,
// and the trailing arguments of both getMemoryOpCost calls).
1604 const SITargetLowering *TLI,
1605 const GCNTTIImpl *TTIImpl) {
1606 const int NrOfSGPRUntilSpill = 26;
1607 const int NrOfVGPRUntilSpill = 32;
1608
1609 const DataLayout &DL = TTIImpl->getDataLayout();
1610
1611 unsigned adjustThreshold = 0;
1612 int SGPRsInUse = 0;
1613 int VGPRsInUse = 0;
// Tally the calling-convention register count of every value type that makes
// up each argument.
1614 for (const Use &A : CB->args()) {
1615 SmallVector<EVT, 4> ValueVTs;
1616 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1617 for (auto ArgVT : ValueVTs) {
1618 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1619 CB->getContext(), CB->getCallingConv(), ArgVT);
1621 SGPRsInUse += CCRegNum;
1622 else
1623 VGPRsInUse += CCRegNum;
1624 }
1625 }
1626
1627 // The cost of passing function arguments through the stack:
1628 // 1 instruction to put a function argument on the stack in the caller.
1629 // 1 instruction to take a function argument from the stack in callee.
1630 // 1 instruction to explicitly take care of data dependencies in callee
1631 // function.
1632 InstructionCost ArgStackCost(1);
1633 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1634 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1636 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1637 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1639
1640 // The penalty cost is computed relative to the cost of instructions and does
1641 // not model any storage costs.
// std::max(0, ...) keeps the penalty at zero while usage is within budget.
1642 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1643 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1644 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1645 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1646 return adjustThreshold;
1647}
1648
// Sums the allocation sizes of the distinct static allocas reachable from the
// flat / private pointer arguments of call site CB.
// NOTE(review): listing lines 1656 and 1667 (the declarations of AIVisited
// and AI) are missing from this extracted listing, so how AI is derived from
// PtrArg is not visible here.
1649static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1650 const DataLayout &DL) {
1651 // If we have a pointer to a private array passed into a function
1652 // it will not be optimized out, leaving scratch usage.
1653 // This function calculates the total size in bytes of the memory that would
1654 // end in scratch if the call was not inlined.
1655 unsigned AllocaSize = 0;
1657 for (Value *PtrArg : CB->args()) {
1658 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1659 if (!Ty)
1660 continue;
1661
1662 unsigned AddrSpace = Ty->getAddressSpace();
1663 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1664 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1665 continue;
1666
// AIVisited.insert(...).second is false for an alloca already counted, so an
// alloca passed through several arguments is only summed once.
1668 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1669 continue;
1670
// Allocas whose size is not available contribute nothing.
1671 if (auto Size = AI->getAllocationSize(DL))
1672 AllocaSize += Size->getFixedValue();
1673 }
1674 return AllocaSize;
1675}
1676
1681
// Inlining-threshold adjustment for call site CB: the register-pressure
// adjustment plus a flat ArgAllocaCost bonus when any alloca-backed
// argument is found.
// NOTE(review): the signature (listing line 1682) is missing from this
// extracted listing.
1683 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1684
1685 // Private object passed as arguments may end up in scratch usage if the call
1686 // is not inlined. Increase the inline threshold to promote inlining.
1687 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1688 if (AllocaSize > 0)
1689 Threshold += ArgAllocaCost;
1690 return Threshold;
1691}
1692
// Cost attributed to alloca AI for call site CB: AI's size-proportional share
// of the (multiplier-adjusted) ArgAllocaCost bonus, or 0 when the total
// alloca size is within ArgAllocaCutoff or AI's size is unavailable.
// NOTE(review): the first line of the signature (listing line 1693) is
// missing from this extracted listing.
1694 const AllocaInst *AI) const {
1695
1696 // Below the cutoff, assume that the private memory objects would be
1697 // optimized
1698 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1699 if (AllocaSize <= ArgAllocaCutoff)
1700 return 0;
1701
1702 // Above the cutoff, we give a cost to each private memory object
1703 // depending on its size. If the array can be optimized by SROA this cost is
1704 // not added to the total-cost in the inliner cost analysis.
1705 //
1706 // We choose the total cost of the alloca such that their sum cancels the
1707 // bonus given in the threshold (ArgAllocaCost).
1708 //
1709 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1710 //
1711 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1712 // the single-bb bonus and the vector-bonus.
1713 //
1714 // We compensate the first two multipliers, by repeating logic from the
1715 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1716 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1717 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1718
// "SingleBB" here means no block in the callee has more than one successor.
// NOTE(review): this dereferences CB->getCalledFunction() unconditionally;
// confirm callers only reach this with a direct call.
1719 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1720 return BB.getTerminator()->getNumSuccessors() > 1;
1721 });
1722 if (SingleBB) {
1723 Threshold += Threshold / 2;
1724 }
1725
1726 auto ArgAllocaSize = AI->getAllocationSize(DL);
1727 if (!ArgAllocaSize)
1728 return 0;
1729
1730 // Attribute the bonus proportionally to the alloca size
// AllocaSize is non-zero here: the cutoff check above already returned for
// AllocaSize <= ArgAllocaCutoff.
1731 unsigned AllocaThresholdBonus =
1732 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1733
1734 return AllocaThresholdBonus;
1735}
1736
// Forwards the unrolling-preference query unchanged to CommonTTI.
// NOTE(review): the first lines of the signature (listing lines 1737-1738)
// are missing from this extracted listing.
1739 OptimizationRemarkEmitter *ORE) const {
1740 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1741}
1742
// Forwards the peeling-preference query unchanged to CommonTTI.
// NOTE(review): the first line of the signature (listing line 1743) is
// missing from this extracted listing.
1744 TTI::PeelingPreferences &PP) const {
1745 CommonTTI.getPeelingPreferences(L, SE, PP);
1746}
1747
1748int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
1749 return getQuarterRateInstrCost(CostKind);
1750}
1751
1752int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1753 return ST->hasFullRate64Ops()
1754 ? getFullRateInstrCost()
1755 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1756 : getQuarterRateInstrCost(CostKind);
1757}
1758
1759std::pair<InstructionCost, MVT>
1760GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1761 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1762 auto Size = DL.getTypeSizeInBits(Ty);
1763 // Maximum load or store can handle 8 dwords for scalar and 4 for
1764 // vector ALU. Let's assume anything above 8 dwords is expensive
1765 // even if legal.
1766 if (Size <= 256)
1767 return Cost;
1768
1769 Cost.first += (Size + 255) / 256;
1770 return Cost;
1771}
1772
// Returns 128 when the subtarget reports prefetch support and 0 otherwise.
// NOTE(review): the signature (listing line 1773) is missing from this
// extracted listing, so the unit of the returned value is not visible here.
1774 return ST->hasPrefetch() ? 128 : 0;
1775}
1776
1779}
1780
// Appends seven (name, value) pairs describing function F to LB: the three
// max-num-workgroups components, the flat work-group size pair and the
// waves-per-EU pair, all queried from the subtarget.
// NOTE(review): the first line of the signature (listing line 1781) is
// missing from this extracted listing.
1782 const Function &F,
1783 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
// Indexed 0..2 below, so the subtarget is expected to return at least three
// entries — TODO confirm.
1784 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1785 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1786 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1787 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1788 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1789 ST->getFlatWorkGroupSizes(F);
1790 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1791 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1792 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1793 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1794 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1795}
1796
// Determines the IEEE mode known for instruction I from the subtarget
// feature, the function's "amdgpu-ieee" attribute and its calling
// convention.
// NOTE(review): listing lines 1797-1798, 1804, 1808 and 1811 are missing from
// this extracted listing (the signature, the results for the null-function
// and attribute-present cases, and the non-shader alternative of the final
// conditional).
1799 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1800 return KnownIEEEMode::On; // Only mode on gfx1170+
1801
1802 const Function *F = I.getFunction();
1803 if (!F)
1805
1806 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1807 if (IEEEAttr.isValid())
1809
// Without an explicit attribute, shader calling conventions give Off.
1810 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1812}
1813
// Cost of a memory operation on type Src. Loads and stores of vectors of i8
// that pass the (not visible) extra condition get a cost derived from the
// vector's bit size; everything else uses the base implementation.
// NOTE(review): listing lines 1814, 1817, 1822 and 1825 are missing from this
// extracted listing (signature head, the CostKind parameter, one condition
// term and the divisor of divideCeil).
1815 Align Alignment,
1816 unsigned AddressSpace,
1818 TTI::OperandValueInfo OpInfo,
1819 const Instruction *I) const {
1820 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1821 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1823 VecTy->getElementType()->isIntegerTy(8)) {
1824 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1826 }
1827 }
1828 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1829 OpInfo, I);
1830}
1831
// Number of parts for type Tp. Vectors of i8 are counted in groups of four
// elements; every other type uses the base implementation.
// NOTE(review): the signature (listing line 1832) is missing from this
// extracted listing.
1833 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1834 if (VecTy->getElementType()->isIntegerTy(8)) {
1835 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
// NOTE(review): assuming divideCeil rounds up, the "- 1" makes 1 element
// yield 0 parts and 5 elements yield 1 part — confirm this is intended.
1836 return divideCeil(ElementCount - 1, 4);
1837 }
1838 }
1839 return BaseT::getNumberOfParts(Tp);
1840}
1841
// Classifies value V by consulting, in order, an intrinsic-specific case for
// amdgcn_wave_shuffle, isAlwaysUniform(V) and isSourceOfDivergence(V).
// NOTE(review): listing lines 1842-1843, 1846, 1853, 1856 and 1858 are
// missing from this extracted listing (the signature, the intrinsic cast and
// every return statement), so the returned values are not visible here.
1844 switch (Intrinsic->getIntrinsicID()) {
1845 case Intrinsic::amdgcn_wave_shuffle:
1847 default:
1848 break;
1849 }
1850 }
1851
1852 if (isAlwaysUniform(V))
1854
1855 if (isSourceOfDivergence(V))
1857
1859}
1860
// Cost of a base + Scale * index addressing mode for type Ty in AddrSpace.
// With a base register and non-zero scale the result is 0 when the mode can
// be folded (see below) and 1 otherwise; all other queries go to the base
// implementation.
// NOTE(review): listing lines 1861 and 1870 (signature head and the first
// address-space alternatives of the condition) are missing from this
// extracted listing.
1862 StackOffset BaseOffset,
1863 bool HasBaseReg, int64_t Scale,
1864 unsigned AddrSpace) const {
1865 if (HasBaseReg && Scale != 0) {
1866 // gfx1250+ can fold base+scale*index when scale matches the memory access
1867 // size (scale_offset bit). Supported for flat/global/constant/scratch
1868 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1869 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1871 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1872 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
// Free only when the store size is at most 16 bytes and equals Scale exactly.
1873 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1874 if (TypeSize::isKnownLE(StoreSize, TypeSize::getFixed(16)) &&
1875 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1876 return 0;
1877 }
1878 return 1;
1879 }
1880 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1881 AddrSpace);
1882}
1883
// Returns true when LSR cost A is strictly lower than B under a lexicographic
// comparison of: Insns + ScaleCost, NumIVMuls, AddRecCost, NumBaseAdds,
// SetupCost, ImmCost and finally NumRegs.
// NOTE(review): the first line of the signature (listing line 1884) is
// missing from this extracted listing.
1885 const TTI::LSRCost &B) const {
1886 // Favor lower per-iteration work over preheader/setup costs.
1887 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1888 // effective instruction count (base+scale*index requires a separate ADD).
1889 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1890 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1891
// std::tie builds tuples of references, compared element by element.
1892 return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
1893 A.SetupCost, A.ImmCost, A.NumRegs) <
1894 std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
1895 B.SetupCost, B.ImmCost, B.NumRegs);
1896}
1897
// Always answers false.
// NOTE(review): the signature (listing line 1898) is missing from this
// extracted listing, so the name of the hook is not visible here.
1899 // isLSRCostLess de-prioritizes register count; keep consistent.
1900 return false;
1901}
1902
// Always answers true.
// NOTE(review): the signature (listing line 1903) is missing from this
// extracted listing, so the name of the hook is not visible here.
1904 // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
1905 return true;
1906}
1907
// Decides uniformity of an intrinsic result from the per-argument uniformity
// bits in UniformArgs. Only amdgcn_wave_shuffle is handled; any other
// intrinsic reaches llvm_unreachable.
// NOTE(review): listing lines 1908 and 1910 (signature head and the
// declaration of Intrinsic) are missing from this extracted listing.
1909 const SmallBitVector &UniformArgs) const {
1911 switch (Intrinsic->getIntrinsicID()) {
1912 case Intrinsic::amdgcn_wave_shuffle:
1913 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1914 // is uniform.
1915 return UniformArgs[0] || UniformArgs[1];
1916 default:
1917 llvm_unreachable("unexpected intrinsic in isUniform");
1918 }
1919}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file defines a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool approxFunc() const
Definition FMF.h:70
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const override
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool shouldDropLSRSolutionIfLessProfitable() const override
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
ValueUniformity getValueUniformity(const Value *V) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
bool preferSLPInstCountCheck() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
bool isNumRegsMajorCostOfLSR() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1069
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
@ Length
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...