LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include <optional>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
39 "amdgpu-unroll-threshold-private",
40 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
41 cl::init(2700), cl::Hidden);
42
44 "amdgpu-unroll-threshold-local",
45 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
46 cl::init(1000), cl::Hidden);
47
49 "amdgpu-unroll-threshold-if",
50 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
51 cl::init(200), cl::Hidden);
52
54 "amdgpu-unroll-runtime-local",
55 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
56 cl::init(true), cl::Hidden);
57
59 "amdgpu-unroll-max-block-to-analyze",
60 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
61 cl::init(32), cl::Hidden);
62
63static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
64 cl::Hidden, cl::init(4000),
65 cl::desc("Cost of alloca argument"));
66
67// If the amount of scratch memory to eliminate exceeds our ability to allocate
68// it into registers we gain nothing by aggressively inlining functions for that
69// heuristic.
71 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
72 cl::init(256),
73 cl::desc("Maximum alloca size to use for inline cost"));
74
75// Inliner constraint to achieve reasonable compilation time.
77 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
78 cl::desc("Maximum number of BBs allowed in a function after inlining"
79 " (compile time constraint)"));
80
81// This default unroll factor is based on microbenchmarks on gfx1030.
83 "amdgpu-memcpy-loop-unroll",
84 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
85 "operations when lowering statically-sized memcpy, memmove, or"
86 "memset as a loop"),
87 cl::init(16), cl::Hidden);
88
89static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
90 unsigned Depth = 0) {
92 if (!I)
93 return false;
94
95 if (!L->contains(I))
96 return false;
97 for (const Value *V : I->operand_values()) {
98 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
99 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
100 return SubLoop->contains(PHI); }))
101 return true;
102 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
103 return true;
104 }
105 return false;
106}
107
109 : BaseT(TM, F.getDataLayout()),
110 TargetTriple(TM->getTargetTriple()),
111 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
112 TLI(ST->getTargetLowering()) {}
113
116 OptimizationRemarkEmitter *ORE) const {
117 const Function &F = *L->getHeader()->getParent();
118 UP.Threshold =
119 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
120 UP.MaxCount = std::numeric_limits<unsigned>::max();
121 UP.Partial = true;
122
123 // Conditional branch in a loop back edge needs 3 additional exec
124 // manipulations in average.
125 UP.BEInsns += 3;
126
127 // We want to run unroll even for the loops which have been vectorized.
128 UP.UnrollVectorizedLoop = true;
129
130 // Enable runtime unrolling for loops whose trip count is not known at
131 // compile time.
132 UP.Runtime = true;
133
  // Maximum alloca size that can fit in registers. Reserve 16 registers.
135 const unsigned MaxAlloca = (256 - 16) * 4;
136 unsigned ThresholdPrivate = UnrollThresholdPrivate;
137 unsigned ThresholdLocal = UnrollThresholdLocal;
138
139 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
140 // provided threshold value as the default for Threshold
141 if (MDNode *LoopUnrollThreshold =
142 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
143 if (LoopUnrollThreshold->getNumOperands() == 2) {
145 LoopUnrollThreshold->getOperand(1));
146 if (MetaThresholdValue) {
147 // We will also use the supplied value for PartialThreshold for now.
148 // We may introduce additional metadata if it becomes necessary in the
149 // future.
150 UP.Threshold = MetaThresholdValue->getSExtValue();
152 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
153 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
154 }
155 }
156 }
157
158 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
159 for (const BasicBlock *BB : L->getBlocks()) {
160 const DataLayout &DL = BB->getDataLayout();
161 unsigned LocalGEPsSeen = 0;
162
163 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
164 return SubLoop->contains(BB); }))
165 continue; // Block belongs to an inner loop.
166
167 for (const Instruction &I : *BB) {
168 // Unroll a loop which contains an "if" statement whose condition
169 // defined by a PHI belonging to the loop. This may help to eliminate
170 // if region and potentially even PHI itself, saving on both divergence
171 // and registers used for the PHI.
172 // Add a small bonus for each of such "if" statements.
173 if (const CondBrInst *Br = dyn_cast<CondBrInst>(&I)) {
174 if (UP.Threshold < MaxBoost) {
175 BasicBlock *Succ0 = Br->getSuccessor(0);
176 BasicBlock *Succ1 = Br->getSuccessor(1);
177 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
178 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
179 continue;
180 if (dependsOnLocalPhi(L, Br->getCondition())) {
182 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
183 << " for loop:\n"
184 << *L << " due to " << *Br << '\n');
185 if (UP.Threshold >= MaxBoost)
186 return;
187 }
188 }
189 continue;
190 }
191
193 if (!GEP)
194 continue;
195
196 unsigned AS = GEP->getAddressSpace();
197 unsigned Threshold = 0;
199 Threshold = ThresholdPrivate;
201 Threshold = ThresholdLocal;
202 else
203 continue;
204
205 if (UP.Threshold >= Threshold)
206 continue;
207
208 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
209 const Value *Ptr = GEP->getPointerOperand();
210 const AllocaInst *Alloca =
212 if (!Alloca || !Alloca->isStaticAlloca())
213 continue;
214 auto AllocaSize = Alloca->getAllocationSize(DL);
215 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
216 continue;
217 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
219 LocalGEPsSeen++;
220 // Inhibit unroll for local memory if we have seen addressing not to
221 // a variable, most likely we will be unable to combine it.
222 // Do not unroll too deep inner loops for local memory to give a chance
223 // to unroll an outer loop for a more important reason.
224 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
225 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
226 !isa<Argument>(GEP->getPointerOperand())))
227 continue;
228 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
229 << *L << " due to LDS use.\n");
231 }
232
233 // Check if GEP depends on a value defined by this loop itself.
234 bool HasLoopDef = false;
235 for (const Value *Op : GEP->operands()) {
236 const Instruction *Inst = dyn_cast<Instruction>(Op);
237 if (!Inst || L->isLoopInvariant(Op))
238 continue;
239
240 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
241 return SubLoop->contains(Inst); }))
242 continue;
243 HasLoopDef = true;
244 break;
245 }
246 if (!HasLoopDef)
247 continue;
248
249 // We want to do whatever we can to limit the number of alloca
250 // instructions that make it through to the code generator. allocas
251 // require us to use indirect addressing, which is slow and prone to
252 // compiler bugs. If this loop does an address calculation on an
253 // alloca ptr, then we want to use a higher than normal loop unroll
254 // threshold. This will give SROA a better chance to eliminate these
255 // allocas.
256 //
257 // We also want to have more unrolling for local memory to let ds
258 // instructions with different offsets combine.
259 //
260 // Don't use the maximum allowed value here as it will make some
261 // programs way too big.
262 UP.Threshold = Threshold;
263 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
264 << " for loop:\n"
265 << *L << " due to " << *GEP << '\n');
266 if (UP.Threshold >= MaxBoost)
267 return;
268 }
269
270 // If we got a GEP in a small BB from inner loop then increase max trip
271 // count to analyze for better estimation cost in unroll
272 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
274 }
275}
276
281
285
286const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
287 // Codegen control options which don't matter.
288 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
289 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
290 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
291
292 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
293
294 // Property of the kernel/environment which can't actually differ.
295 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
296 AMDGPU::FeatureTrapHandler,
297
298 // The default assumption needs to be ecc is enabled, but no directly
299 // exposed operations depend on it, so it can be safely inlined.
300 AMDGPU::FeatureSRAMECC,
301
302 // Perf-tuning features
303 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
304
306 : BaseT(TM, F.getDataLayout()),
307 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
308 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
309 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
311 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
312 HasFP64FP16Denormals =
313 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
314}
315
317 return !F || !ST->isSingleLaneExecution(*F);
318}
319
320unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
321 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
322 // registers. See getRegisterClassForType for the implementation.
323 // In this case vector registers are not vector in terms of
324 // VGPRs, but those which can hold multiple values.
325
326 // This is really the number of registers to fill when vectorizing /
327 // interleaving loops, so we lie to avoid trying to use all registers.
328 return 4;
329}
330
333 switch (K) {
335 return TypeSize::getFixed(32);
337 return TypeSize::getFixed((ST->hasPackedFP64Ops() || ST->hasPackedU64Ops())
338 ? 128
339 : ST->hasPackedFP32Ops() ? 64
340 : 32);
342 return TypeSize::getScalable(0);
343 }
344 llvm_unreachable("Unsupported register kind");
345}
346
348 return 32;
349}
350
351unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
352 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
353 return 32 * 4 / ElemWidth;
354 // For a given width return the max 0number of elements that can be combined
355 // into a wider bit value:
356 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
357 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
358 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
359 : (ElemWidth == 64 &&
360 (ST->hasPackedFP64Ops() || ST->hasPackedU64Ops()))
361 ? 2
362 : 1;
363}
364
366 // The integer inst-count heuristic causes regressions on gfx94x and gfx950
367 // because 2-element vector trees that pass the scalar/vector instruction
368 // count comparison still widen scalar moves (e.g. v_mov_b32 to v_mov_b64)
369 // after codegen, increasing register pressure and throughput cost without
370 // reducing the total instruction count.
371 return !ST->hasGFX940Insts() && !ST->hasGFX950Insts();
372}
373
374unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
375 unsigned ChainSizeInBytes,
376 VectorType *VecTy) const {
377 unsigned VecRegBitWidth = VF * LoadSize;
378 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
379 // TODO: Support element-size less than 32bit?
380 return 128 / LoadSize;
381
382 return VF;
383}
384
385unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
386 unsigned ChainSizeInBytes,
387 VectorType *VecTy) const {
388 unsigned VecRegBitWidth = VF * StoreSize;
389 if (VecRegBitWidth > 128)
390 return 128 / StoreSize;
391
392 return VF;
393}
394
395unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
396 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
397 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
399 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
400 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
402 return 512;
403 }
404
405 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
406 return 8 * ST->getMaxPrivateElementSize();
407
408 // Common to flat, global, local and region. Assume for unknown addrspace.
409 return 128;
410}
411
412bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
413 Align Alignment,
414 unsigned AddrSpace) const {
415 // We allow vectorization of flat stores, even though we may need to decompose
416 // them later if they may access private memory. We don't have enough context
417 // here, and legalization can handle it.
418 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
419 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
420 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
421 }
422 return true;
423}
424
425bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
426 Align Alignment,
427 unsigned AddrSpace) const {
428 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
429}
430
431bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
432 Align Alignment,
433 unsigned AddrSpace) const {
434 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
435}
436
440
442 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
443 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
444 std::optional<uint32_t> AtomicElementSize) const {
445
446 if (AtomicElementSize)
447 return Type::getIntNTy(Context, *AtomicElementSize * 8);
448
449 // 16-byte accesses achieve the highest copy throughput.
450 // If the operation has a fixed known length that is large enough, it is
451 // worthwhile to return an even wider type and let legalization lower it into
452 // multiple accesses, effectively unrolling the memcpy loop.
453 // We also rely on legalization to decompose into smaller accesses for
454 // subtargets and address spaces where it is necessary.
455 //
456 // Don't unroll if Length is not a constant, since unrolling leads to worse
457 // performance for length values that are smaller or slightly larger than the
458 // total size of the type returned here. Mitigating that would require a more
459 // complex lowering for variable-length memcpy and memmove.
460 unsigned I32EltsInVector = 4;
463 MemcpyLoopUnroll * I32EltsInVector);
464
465 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
466}
467
469 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
470 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
471 Align SrcAlign, Align DestAlign,
472 std::optional<uint32_t> AtomicCpySize) const {
473
474 if (AtomicCpySize)
476 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
477 DestAlign, AtomicCpySize);
478
479 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
480 while (RemainingBytes >= 16) {
481 OpsOut.push_back(I32x4Ty);
482 RemainingBytes -= 16;
483 }
484
485 Type *I64Ty = Type::getInt64Ty(Context);
486 while (RemainingBytes >= 8) {
487 OpsOut.push_back(I64Ty);
488 RemainingBytes -= 8;
489 }
490
491 Type *I32Ty = Type::getInt32Ty(Context);
492 while (RemainingBytes >= 4) {
493 OpsOut.push_back(I32Ty);
494 RemainingBytes -= 4;
495 }
496
497 Type *I16Ty = Type::getInt16Ty(Context);
498 while (RemainingBytes >= 2) {
499 OpsOut.push_back(I16Ty);
500 RemainingBytes -= 2;
501 }
502
503 Type *I8Ty = Type::getInt8Ty(Context);
504 while (RemainingBytes) {
505 OpsOut.push_back(I8Ty);
506 --RemainingBytes;
507 }
508}
509
511 // Disable unrolling if the loop is not vectorized.
512 // TODO: Enable this again.
513 if (VF.isScalar())
514 return 1;
515
516 return 8;
517}
518
520 MemIntrinsicInfo &Info) const {
521 switch (Inst->getIntrinsicID()) {
522 case Intrinsic::amdgcn_ds_ordered_add:
523 case Intrinsic::amdgcn_ds_ordered_swap: {
524 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
525 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
526 if (!Ordering || !Volatile)
527 return false; // Invalid.
528
529 unsigned OrderingVal = Ordering->getZExtValue();
530 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
531 return false;
532
533 Info.PtrVal = Inst->getArgOperand(0);
534 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
535 Info.ReadMem = true;
536 Info.WriteMem = true;
537 Info.IsVolatile = !Volatile->isZero();
538 return true;
539 }
540 default:
541 return false;
542 }
543}
544
546 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
548 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
549
550 // Legalize the type.
551 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
552 int ISD = TLI->InstructionOpcodeToISD(Opcode);
553
554 // Because we don't have any legal vector operations, but the legal types, we
555 // need to account for split vectors.
556 unsigned NElts = LT.second.isVector() ?
557 LT.second.getVectorNumElements() : 1;
558
559 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
560
561 switch (ISD) {
562 case ISD::SHL:
563 case ISD::SRL:
564 case ISD::SRA:
565 if (SLT == MVT::i64)
566 return get64BitInstrCost(CostKind) * LT.first * NElts;
567
568 if (ST->has16BitInsts() && SLT == MVT::i16)
569 NElts = (NElts + 1) / 2;
570
571 // i32
572 return getFullRateInstrCost() * LT.first * NElts;
573 case ISD::ADD:
574 case ISD::SUB:
575 if (SLT == MVT::i64 && ST->hasPackedU64Ops())
576 NElts = (NElts + 1) / 2;
577 [[fallthrough]];
578 case ISD::AND:
579 case ISD::OR:
580 case ISD::XOR:
581 if (SLT == MVT::i64) {
582 // and, or and xor are typically split into 2 VALU instructions.
583 return 2 * getFullRateInstrCost() * LT.first * NElts;
584 }
585
586 if (ST->has16BitInsts() && SLT == MVT::i16)
587 NElts = (NElts + 1) / 2;
588
589 return LT.first * NElts * getFullRateInstrCost();
590 case ISD::MUL: {
591 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
592 if (SLT == MVT::i64) {
593 const int FullRateCost = getFullRateInstrCost();
594 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
595 }
596
597 if (ST->has16BitInsts() && SLT == MVT::i16)
598 NElts = (NElts + 1) / 2;
599
600 // i32
601 return QuarterRateCost * NElts * LT.first;
602 }
603 case ISD::FMUL:
604 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
605 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
606 // fused operation.
607 if (CxtI && CxtI->hasOneUse())
608 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
609 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
610 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
611 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
613 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
615
616 // Estimate all types may be fused with contract/unsafe flags
617 const TargetOptions &Options = TLI->getTargetMachine().Options;
618 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
619 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
621 }
622 }
623 [[fallthrough]];
624 case ISD::FADD:
625 case ISD::FSUB:
626 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
627 NElts = (NElts + 1) / 2;
628 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
629 NElts = (NElts + 1) / 2;
630 if (SLT == MVT::f64) {
631 if (ST->hasPackedFP64Ops())
632 NElts = (NElts + 1) / 2;
633 return LT.first * NElts * get64BitInstrCost(CostKind);
634 }
635
636 if (ST->has16BitInsts() && SLT == MVT::f16)
637 NElts = (NElts + 1) / 2;
638
639 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
640 return LT.first * NElts * getFullRateInstrCost();
641 break;
642 case ISD::FDIV:
643 case ISD::FREM:
644 // FIXME: frem should be handled separately. The fdiv in it is most of it,
645 // but the current lowering is also not entirely correct.
646 if (SLT == MVT::f64) {
647 int Cost = 7 * get64BitInstrCost(CostKind) +
648 getQuarterRateInstrCost(CostKind) +
649 3 * getHalfRateInstrCost(CostKind);
650 // Add cost of workaround.
651 if (!ST->hasUsableDivScaleConditionOutput())
652 Cost += 3 * getFullRateInstrCost();
653
654 return LT.first * Cost * NElts;
655 }
656
657 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
658 // TODO: This is more complicated, unsafe flags etc.
659 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
660 (SLT == MVT::f16 && ST->has16BitInsts())) {
661 return LT.first * getTransInstrCost(CostKind) * NElts;
662 }
663 }
664
665 if (SLT == MVT::f16 && ST->has16BitInsts()) {
666 // 2 x v_cvt_f32_f16
667 // f32 rcp
668 // f32 fmul
669 // v_cvt_f16_f32
670 // f16 div_fixup
671 int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
672 return LT.first * Cost * NElts;
673 }
674
675 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
676 // Fast unsafe fdiv lowering:
677 // f32 rcp
678 // f32 fmul
679 int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
680 return LT.first * Cost * NElts;
681 }
682
683 if (SLT == MVT::f32 || SLT == MVT::f16) {
684 // 4 more v_cvt_* insts without f16 insts support
685 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
686 1 * getTransInstrCost(CostKind);
687
688 if (!HasFP32Denormals) {
689 // FP mode switches.
690 Cost += 2 * getFullRateInstrCost();
691 }
692
693 return LT.first * NElts * Cost;
694 }
695 break;
696 case ISD::FNEG:
    // Use the backend's estimation. If fneg is not free each element will cost
698 // one additional instruction.
699 return TLI->isFNegFree(SLT) ? 0 : NElts;
700 default:
701 break;
702 }
703
704 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
705 Args, CxtI);
706}
707
708// Return true if there's a potential benefit from using v2f16/v2i16
709// instructions for an intrinsic, even if it requires nontrivial legalization.
711 switch (ID) {
712 case Intrinsic::fma:
713 case Intrinsic::fmuladd:
714 case Intrinsic::copysign:
715 case Intrinsic::minimumnum:
716 case Intrinsic::maximumnum:
717 case Intrinsic::canonicalize:
718 // There's a small benefit to using vector ops in the legalized code.
719 case Intrinsic::round:
720 case Intrinsic::uadd_sat:
721 case Intrinsic::usub_sat:
722 case Intrinsic::sadd_sat:
723 case Intrinsic::ssub_sat:
724 case Intrinsic::abs:
725 return true;
726 default:
727 return false;
728 }
729}
730
734 switch (ICA.getID()) {
735 case Intrinsic::fabs:
736 // Free source modifier in the common case.
737 return 0;
738 case Intrinsic::amdgcn_workitem_id_x:
739 case Intrinsic::amdgcn_workitem_id_y:
740 case Intrinsic::amdgcn_workitem_id_z:
741 // TODO: If hasPackedTID, or if the calling context is not an entry point
742 // there may be a bit instruction.
743 return 0;
744 case Intrinsic::amdgcn_workgroup_id_x:
745 case Intrinsic::amdgcn_workgroup_id_y:
746 case Intrinsic::amdgcn_workgroup_id_z:
747 case Intrinsic::amdgcn_lds_kernel_id:
748 case Intrinsic::amdgcn_dispatch_ptr:
749 case Intrinsic::amdgcn_dispatch_id:
750 case Intrinsic::amdgcn_implicitarg_ptr:
751 case Intrinsic::amdgcn_queue_ptr:
752 // Read from an argument register.
753 return 0;
754 default:
755 break;
756 }
757
758 Type *RetTy = ICA.getReturnType();
759
760 Intrinsic::ID IID = ICA.getID();
761 switch (IID) {
762 case Intrinsic::exp:
763 case Intrinsic::exp2:
764 case Intrinsic::exp10: {
765 // Legalize the type.
766 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
767 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
768 unsigned NElts =
769 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
770
771 if (SLT == MVT::f64) {
772 unsigned NumOps = 20;
773 if (IID == Intrinsic::exp)
774 ++NumOps;
775 else if (IID == Intrinsic::exp10)
776 NumOps += 3;
777
778 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
779 }
780
781 if (SLT == MVT::f32) {
782 unsigned NumFullRateOps = 0;
783 // v_exp_f32 (transcendental).
784 unsigned NumTransOps = 1;
785
786 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
787 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
788 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
789 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
790 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
791 } else {
792 if (IID == Intrinsic::exp) {
793 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
794 NumFullRateOps = 1;
795 } else if (IID == Intrinsic::exp10) {
796 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
797 NumFullRateOps = 3;
798 NumTransOps = 2;
799 }
800 // Denorm scaling adds setcc + select + fadd + select + fmul.
801 if (HasFP32Denormals)
802 NumFullRateOps += 5;
803 }
804
805 InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
806 NumTransOps * getTransInstrCost(CostKind);
807 return LT.first * NElts * Cost;
808 }
809
810 break;
811 }
812 case Intrinsic::log:
813 case Intrinsic::log2:
814 case Intrinsic::log10: {
815 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
816 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
817 unsigned NElts =
818 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
819
820 if (SLT == MVT::f32) {
821 unsigned NumFullRateOps = 0;
822
823 if (IID == Intrinsic::log2) {
824 // LowerFLOG2: just v_log_f32.
825 } else if (ICA.getFlags().approxFunc()) {
826 // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
827 NumFullRateOps = 1;
828 } else {
829 // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
830 // multiply + finite check.
831 NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
832 }
833
834 if (HasFP32Denormals)
835 NumFullRateOps += 5;
836
838 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
839 return LT.first * NElts * Cost;
840 }
841
842 break;
843 }
844 case Intrinsic::sin:
845 case Intrinsic::cos: {
846 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
847 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
848 unsigned NElts =
849 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
850
851 if (SLT == MVT::f32) {
852 // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
853 unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;
854
856 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
857 return LT.first * NElts * Cost;
858 }
859
860 break;
861 }
862 case Intrinsic::sqrt: {
863 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
864 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
865 unsigned NElts =
866 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
867
868 if (SLT == MVT::f32) {
869 unsigned NumFullRateOps = 0;
870
871 if (!ICA.getFlags().approxFunc()) {
872 // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
873 NumFullRateOps = HasFP32Denormals ? 17 : 16;
874 }
875
877 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
878 return LT.first * NElts * Cost;
879 }
880
881 break;
882 }
883 default:
884 break;
885 }
886
889
890 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
891 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
892 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
893
894 if ((ST->hasVOP3PInsts() &&
895 (SLT == MVT::f16 || SLT == MVT::i16 ||
896 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
897 (ST->hasPackedFP32Ops() && SLT == MVT::f32) ||
898 (ST->hasPackedFP64Ops() && SLT == MVT::f64) ||
899 (ST->hasPackedU64Ops() && SLT == MVT::i64))
900 NElts = (NElts + 1) / 2;
901
902 // TODO: Get more refined intrinsic costs?
903 unsigned InstRate = getQuarterRateInstrCost(CostKind);
904
905 switch (ICA.getID()) {
906 case Intrinsic::fma:
907 case Intrinsic::fmuladd:
908 if (SLT == MVT::f64) {
909 InstRate = get64BitInstrCost(CostKind);
910 break;
911 }
912
913 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
914 InstRate = getFullRateInstrCost();
915 else {
916 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
917 : getQuarterRateInstrCost(CostKind);
918 }
919 break;
920 case Intrinsic::copysign:
921 return NElts * getFullRateInstrCost();
922 case Intrinsic::minimumnum:
923 case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
926 unsigned NumOps = 3;
927 if (const IntrinsicInst *II = ICA.getInst()) {
928 // Directly legal with ieee=0
929 // TODO: Not directly legal with strictfp
931 NumOps = 1;
932 }
933
934 unsigned BaseRate =
935 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
936 InstRate = BaseRate * NumOps;
937 break;
938 }
939 case Intrinsic::canonicalize: {
940 InstRate =
941 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
942 break;
943 }
944 case Intrinsic::uadd_sat:
945 case Intrinsic::usub_sat:
946 case Intrinsic::sadd_sat:
947 case Intrinsic::ssub_sat: {
948 if (SLT == MVT::i16 || SLT == MVT::i32)
949 InstRate = getFullRateInstrCost();
950
951 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
952 if (any_of(ValidSatTys, equal_to(LT.second)))
953 NElts = 1;
954 break;
955 }
956 case Intrinsic::abs:
957 // Expansion takes 2 instructions for VALU
958 if (SLT == MVT::i16 || SLT == MVT::i32)
959 InstRate = 2 * getFullRateInstrCost();
960 break;
961 default:
962 break;
963 }
964
965 return LT.first * NElts * InstRate;
966}
967
970 const Instruction *I) const {
971 assert((I == nullptr || I->getOpcode() == Opcode) &&
972 "Opcode should reflect passed instruction.");
973 const bool SCost =
975 const int CBrCost = SCost ? 5 : 7;
976 switch (Opcode) {
977 case Instruction::UncondBr:
978 // Branch instruction takes about 4 slots on gfx900.
979 return SCost ? 1 : 4;
980 case Instruction::CondBr:
981 // Suppose conditional branch takes additional 3 exec manipulations
982 // instructions in average.
983 return CBrCost;
984 case Instruction::Switch: {
985 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
986 // Each case (including default) takes 1 cmp + 1 cbr instructions in
987 // average.
988 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
989 }
990 case Instruction::Ret:
991 return SCost ? 1 : 10;
992 }
993 return BaseT::getCFInstrCost(Opcode, CostKind, I);
994}
995
998 std::optional<FastMathFlags> FMF,
1001 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1002
1003 EVT OrigTy = TLI->getValueType(DL, Ty);
1004
1005 // Computes cost on targets that have packed math instructions(which support
1006 // 16-bit types only).
1007 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1008 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1009
1010 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1011 return LT.first * getFullRateInstrCost();
1012}
1013
1016 FastMathFlags FMF,
1018 EVT OrigTy = TLI->getValueType(DL, Ty);
1019
1020 // Computes cost on targets that have packed math instructions(which support
1021 // 16-bit types only).
1022 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1023 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1024
1025 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1026 return LT.first * getHalfRateInstrCost(CostKind);
1027}
1028
1030 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
1031 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
1032 switch (Opcode) {
1033 case Instruction::ExtractElement:
1034 case Instruction::InsertElement: {
1035 unsigned EltSize
1036 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1037 // Dynamic indexing isn't free and is best avoided.
1038 if (Index == ~0u)
1039 return 2;
1040 if (EltSize < 32) {
1041 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
1042 return 0;
1043 // Some i8 inserts and extracts are free so we want to reduce the
1044 // cost to avoid scalarization. We limit the zero cost cases to avoid
1045 // adversely impacting all i8 vectorizing.
1046 if (EltSize == 8) {
1047 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
1048 if (NumElts >= 4 && isPowerOf2_32(NumElts)) {
1049 // Extracts at indices aligned to 32-bit boundaries (0, 4, 8, 12 for
1050 // v16i8) are free as they access the low byte of each VGPR. Other
1051 // indices require bit manipulation (shifts/byte selects) and cost 1.
1052 return Index % 4 == 0 ? 0 : 1;
1053 }
1054 }
1055 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1056 VIC);
1057 }
1058
1059 // Extracts are just reads of a subregister, so are free. Inserts are
1060 // considered free because we don't want to have any cost for scalarizing
1061 // operations, and we don't have to copy into a different register class.
1062 return 0;
1063 }
1064 default:
1065 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1066 VIC);
1067 }
1068}
1069
1070/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
1071/// this is analyzing the collective result of all output registers. Otherwise,
1072/// this is only querying a specific result index if this returns multiple
1073/// registers in a struct.
1075 const CallInst *CI, ArrayRef<unsigned> Indices) const {
1076 // TODO: Handle complex extract indices
1077 if (Indices.size() > 1)
1078 return true;
1079
1080 const DataLayout &DL = CI->getDataLayout();
1081 const SIRegisterInfo *TRI = ST->getRegisterInfo();
1082 TargetLowering::AsmOperandInfoVector TargetConstraints =
1083 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
1084
1085 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
1086
1087 int OutputIdx = 0;
1088 for (auto &TC : TargetConstraints) {
1089 if (TC.Type != InlineAsm::isOutput)
1090 continue;
1091
1092 // Skip outputs we don't care about.
1093 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
1094 continue;
1095
1096 TLI->ComputeConstraintToUse(TC, SDValue());
1097
1098 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
1099 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
1100
1101 // For AGPR constraints null is returned on subtargets without AGPRs, so
1102 // assume divergent for null.
1103 if (!RC || !TRI->isSGPRClass(RC))
1104 return true;
1105 }
1106
1107 return false;
1108}
1109
1111 const IntrinsicInst *ReadReg) const {
1112 Metadata *MD =
1113 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
1115 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1116
1117 // Special case registers that look like VCC.
1118 MVT VT = MVT::getVT(ReadReg->getType());
1119 if (VT == MVT::i1)
1120 return true;
1121
1122 // Special case scalar registers that start with 'v'.
1123 if (RegName.starts_with("vcc") || RegName.empty())
1124 return false;
1125
1126 // VGPR or AGPR is divergent. There aren't any specially named vector
1127 // registers.
1128 return RegName[0] == 'v' || RegName[0] == 'a';
1129}
1130
1131/// \returns true if the result of the value could potentially be
1132/// different across workitems in a wavefront.
///
/// The checks visible here cover function arguments, loads, intrinsic calls,
/// other calls (including inline asm), invokes and address space casts. Any
/// value that matches none of them is reported as not being a source of
/// divergence.
///
/// NOTE(review): the embedded numbering skips 1135, 1151, 1154, 1158, 1177 and
/// 1184, so several statements of this function are not present in this
/// listing. The comments below only describe what is visible.
1133bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1134 if (const Argument *A = dyn_cast<Argument>(V))
1136
1137 // Loads from the private and flat address spaces are divergent, because
1138 // threads can execute the load instruction with the same inputs and get
1139 // different results.
1140 //
1141 // All other loads are not divergent, because if threads issue loads with the
1142 // same arguments, they will always get the same result.
1143 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1144 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1145 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1146
1147 // Atomics are divergent because they are executed sequentially: when an
1148 // atomic operation refers to the same address in each thread, then each
1149 // thread after the first sees the value written by the previous thread as
1150 // original value.
1152 return true;
1153
1155 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1156 switch (IID) {
1157 case Intrinsic::read_register:
 // A private -> flat cast is only divergent on subtargets that report
 // globally addressable scratch; every other combination yields false.
1159 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1160 unsigned SrcAS =
1161 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1162 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1163 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1164 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1165 ST->hasGloballyAddressableScratch();
1166 }
 // workitem.id.y / workitem.id.z is not divergent when the subtarget reports
 // uniform Y/Z for this function, or when the required work-group size of
 // the queried dimension (1 for Y, 2 for Z) is known to be exactly 1.
1167 case Intrinsic::amdgcn_workitem_id_y:
1168 case Intrinsic::amdgcn_workitem_id_z: {
1169 const Function *F = Intrinsic->getFunction();
1170 bool HasUniformYZ =
1171 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true);
1172 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1173 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1174 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1175 }
1176 default:
1178 }
1179 }
1180
1181 // Assume all function calls are a source of divergence. Inline asm is
 // tested for first; the statement that handles it is not visible in this
 // listing.
1182 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1183 if (CI->isInlineAsm())
1185 return true;
1186 }
1187
1188 // Invokes are handled like calls: assume they are a source of divergence.
1189 if (isa<InvokeInst>(V))
1190 return true;
1191
1192 // If the target supports globally addressable scratch, the mapping from
1193 // scratch memory to the flat aperture changes, therefore a private -> flat
1194 // address space cast is no longer uniform.
1195 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1196 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1197 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1198 ST->hasGloballyAddressableScratch();
1199 }
1200
1201 return false;
1202}
1203
/// Decides whether \p V should be reported as always uniform.
///
/// Visible cases, in order:
///  - any intrinsic call: the answer is delegated to
///    AMDGPU::isIntrinsicAlwaysUniform;
///  - other calls: reported as not always uniform (inline asm is tested for
///    first, but the statement handling it is not visible in this listing);
///  - two patterns involving a constant \c C and one involving a \c Mask
///    value, which the existing comment relates to "TID / wavefrontsize";
///  - an extractvalue of index 1 from amdgcn.if / amdgcn.else;
///  - an extractvalue from an inline asm call whose selected result is not a
///    source of divergence.
///
/// NOTE(review): the embedded numbering skips 1210, 1234, 1236 and 1242, so
/// the heads of the pattern matches are missing here; which instructions they
/// match cannot be confirmed from this listing.
1204bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1205 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1206 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1207
1208 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1209 if (CI->isInlineAsm())
1211 return false;
1212 }
1213
1214 // In most cases TID / wavefrontsize is uniform.
1215 //
1216 // However, if a kernel has uneven dimensions we can have a value of
1217 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1218 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1219 // packed into the same wave which gives 1 and 0 after the division by 64
1220 // respectively.
1221 //
1222 // The X dimension doesn't reset within a wave if either both the Y
1223 // and Z dimensions are of length 1, or if the X dimension's required
1224 // size is a power of 2. Note, however, if the X dimension's maximum
1225 // size is a power of 2 < the wavefront size, division by the wavefront
1226 // size is guaranteed to yield 0, so this is also a no-reset case.
1227 bool XDimDoesntResetWithinWaves = false;
 // Only instructions have an enclosing function to query; for any other
 // value the flag stays false.
1228 if (auto *I = dyn_cast<Instruction>(V)) {
1229 const Function *F = I->getFunction();
1230 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1231 }
1232 using namespace llvm::PatternMatch;
1233 uint64_t C;
1235 m_ConstantInt(C))) ||
1237 m_ConstantInt(C)))) {
 // Uniform only when the constant is at least log2(wavefront size) and the
 // X dimension does not reset within a wave.
1238 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1239 }
1240
1241 Value *Mask;
1243 m_Value(Mask)))) {
 // Uniform only when at least log2(wavefront size) low bits of the mask are
 // known to be zero and the X dimension does not reset within a wave.
1244 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1245 ST->getWavefrontSizeLog2() &&
1246 XDimDoesntResetWithinWaves;
1247 }
1248
 // Everything below only applies to an extractvalue whose aggregate operand
 // is a call.
1249 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1250 if (!ExtValue)
1251 return false;
1252
1253 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1254 if (!CI)
1255 return false;
1256
1257 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1258 switch (Intrinsic->getIntrinsicID()) {
1259 default:
1260 return false;
 // For amdgcn.if / amdgcn.else only the element selected by the single
 // index 1 is treated as always uniform.
1261 case Intrinsic::amdgcn_if:
1262 case Intrinsic::amdgcn_else: {
1263 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1264 return Indices.size() == 1 && Indices[0] == 1;
1265 }
1266 }
1267 }
1268
1269 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1270 // divergent for the overall struct return. We need to override it in the
1271 // case we're extracting an SGPR component here.
1272 if (CI->isInlineAsm())
1273 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1274
1275 return false;
1276}
1277
1279 Intrinsic::ID IID) const {
1280 switch (IID) {
1281 case Intrinsic::amdgcn_is_shared:
1282 case Intrinsic::amdgcn_is_private:
1283 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1284 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1285 case Intrinsic::amdgcn_load_to_lds:
1286 case Intrinsic::amdgcn_make_buffer_rsrc:
1287 OpIndexes.push_back(0);
1288 return true;
1289 default:
1290 return false;
1291 }
1292}
1293
1295 Value *OldV,
1296 Value *NewV) const {
1297 auto IntrID = II->getIntrinsicID();
1298 switch (IntrID) {
1299 case Intrinsic::amdgcn_is_shared:
1300 case Intrinsic::amdgcn_is_private: {
1301 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1303 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1304 LLVMContext &Ctx = NewV->getType()->getContext();
1305 ConstantInt *NewVal = (TrueAS == NewAS) ?
1307 return NewVal;
1308 }
1309 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1310 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1311 Type *DestTy = II->getType();
1312 Type *SrcTy = NewV->getType();
1313 unsigned NewAS = SrcTy->getPointerAddressSpace();
1315 return nullptr;
1316 Module *M = II->getModule();
1318 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1319 II->setArgOperand(0, NewV);
1320 II->setCalledFunction(NewDecl);
1321 return II;
1322 }
1323 case Intrinsic::amdgcn_load_to_lds: {
1324 Type *SrcTy = NewV->getType();
1325 Module *M = II->getModule();
1326 Function *NewDecl =
1327 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1328 II->setArgOperand(0, NewV);
1329 II->setCalledFunction(NewDecl);
1330 return II;
1331 }
1332 case Intrinsic::amdgcn_make_buffer_rsrc: {
1333 Type *SrcTy = NewV->getType();
1334 Type *DstTy = II->getType();
1335 Module *M = II->getModule();
1337 M, II->getIntrinsicID(), {DstTy, SrcTy});
1338 II->setArgOperand(0, NewV);
1339 II->setCalledFunction(NewDecl);
1340 return II;
1341 }
1342 default:
1343 return nullptr;
1344 }
1345}
1346
1348 VectorType *DstTy, VectorType *SrcTy,
1349 ArrayRef<int> Mask,
1351 int Index, VectorType *SubTp,
1353 const Instruction *CxtI) const {
1354 if (!isa<FixedVectorType>(SrcTy))
1355 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1356 SubTp);
1357
1358 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1359
1360 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1361 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1362 (ScalarSize == 16 || ScalarSize == 8)) {
1363 // Larger vector widths may require additional instructions, but are
1364 // typically cheaper than scalarized versions.
1365 //
1366 // We assume that shuffling at a register granularity can be done for free.
1367 // This is not true for vectors fed into memory instructions, but it is
1368 // effectively true for all other shuffling. The emphasis of the logic here
1369 // is to assist generic transform in cleaning up / canonicalizing those
1370 // shuffles.
1371
1372 // With op_sel VOP3P instructions freely can access the low half or high
1373 // half of a register, so any swizzle of two elements is free.
1374 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1375 unsigned NumSrcElts = SrcVecTy->getNumElements();
1376 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1377 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1378 Kind == TTI::SK_PermuteSingleSrc))
1379 return 0;
1380 }
1381
1382 unsigned EltsPerReg = 32 / ScalarSize;
1383 switch (Kind) {
1384 case TTI::SK_Broadcast:
1385 // A single v_perm_b32 can be re-used for all destination registers.
1386 return 1;
1387 case TTI::SK_Reverse:
1388 // One instruction per register.
1389 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1390 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1393 if (Index % EltsPerReg == 0)
1394 return 0; // Shuffling at register granularity
1395 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1396 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1399 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1400 if (!DstVecTy)
1402 unsigned NumDstElts = DstVecTy->getNumElements();
1403 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1404 unsigned EndIndex = Index + NumInsertElts;
1405 unsigned BeginSubIdx = Index % EltsPerReg;
1406 unsigned EndSubIdx = EndIndex % EltsPerReg;
1407 unsigned Cost = 0;
1408
1409 if (BeginSubIdx != 0) {
1410 // Need to shift the inserted vector into place. The cost is the number
1411 // of destination registers overlapped by the inserted vector.
1412 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1413 }
1414
1415 // If the last register overlap is partial, there may be three source
1416 // registers feeding into it; that takes an extra instruction.
1417 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1418 Cost += 1;
1419
1420 return Cost;
1421 }
1422 case TTI::SK_Splice: {
1423 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1424 if (!DstVecTy)
1426 unsigned NumElts = DstVecTy->getNumElements();
1427 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1428 // Determine the sub-region of the result vector that requires
1429 // sub-register shuffles / mixing.
1430 unsigned EltsFromLHS = NumElts - Index;
1431 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1432 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1433 if (LHSIsAligned && RHSIsAligned)
1434 return 0;
1435 if (LHSIsAligned && !RHSIsAligned)
1436 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1437 if (!LHSIsAligned && RHSIsAligned)
1438 return divideCeil(EltsFromLHS, EltsPerReg);
1439 return divideCeil(NumElts, EltsPerReg);
1440 }
1441 default:
1442 break;
1443 }
1444
1445 if (!Mask.empty()) {
1446 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1447
1448 // Generically estimate the cost by assuming that each destination
1449 // register is derived from sources via v_perm_b32 instructions if it
1450 // can't be copied as-is.
1451 //
1452 // For each destination register, derive the cost of obtaining it based
1453 // on the number of source registers that feed into it.
1454 unsigned Cost = 0;
1455 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1457 bool Aligned = true;
1458 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1459 int SrcIdx = Mask[DstIdx + I];
1460 if (SrcIdx == -1)
1461 continue;
1462 int Reg;
1463 if (SrcIdx < (int)NumSrcElts) {
1464 Reg = SrcIdx / EltsPerReg;
1465 if (SrcIdx % EltsPerReg != I)
1466 Aligned = false;
1467 } else {
1468 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1469 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1470 Aligned = false;
1471 }
1472 if (!llvm::is_contained(Regs, Reg))
1473 Regs.push_back(Reg);
1474 }
1475 if (Regs.size() >= 2)
1476 Cost += Regs.size() - 1;
1477 else if (!Aligned)
1478 Cost += 1;
1479 }
1480 return Cost;
1481 }
1482 }
1483
1484 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1485 SubTp);
1486}
1487
1488/// Whether it is profitable to sink the operands of an
1489/// Instruction I to the basic block of I.
1490/// This helps using several modifiers (like abs and neg) more often.
1492 SmallVectorImpl<Use *> &Ops) const {
1493 using namespace PatternMatch;
1494
1495 for (auto &Op : I->operands()) {
1496 // Ensure we are not already sinking this operand.
1497 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1498 continue;
1499
1500 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1501 Ops.push_back(&Op);
1502 continue;
1503 }
1504
1505 // Check for zero-cost multiple use InsertElement/ExtractElement
1506 // instructions
1507 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1508 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1509 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1510 if (VecOpInst && VecOpInst->hasOneUse())
1511 continue;
1512
1513 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1515 OpInst->getOperand(0),
1516 OpInst->getOperand(1)) == 0) {
1517 Ops.push_back(&Op);
1518 continue;
1519 }
1520 }
1521 }
1522
1523 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1524
1525 unsigned EltSize = DL.getTypeSizeInBits(
1526 cast<VectorType>(Shuffle->getType())->getElementType());
1527
1528 // For i32 (or greater) shufflevectors, these will be lowered into a
1529 // series of insert / extract elements, which will be coalesced away.
1530 if (EltSize < 16 || !ST->has16BitInsts())
1531 continue;
1532
1533 int NumSubElts, SubIndex;
1534 if (Shuffle->changesLength()) {
1535 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1536 Ops.push_back(&Op);
1537 continue;
1538 }
1539
1540 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1541 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1542 !(SubIndex & 0x1)) {
1543 Ops.push_back(&Op);
1544 continue;
1545 }
1546 }
1547
1548 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1549 Shuffle->isSingleSource()) {
1550 Ops.push_back(&Op);
1551 continue;
1552 }
1553 }
1554 }
1555
1556 return !Ops.empty();
1557}
1558
1560 const Function *Callee) const {
1561 const TargetMachine &TM = getTLI()->getTargetMachine();
1562 const GCNSubtarget *CallerST
1563 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1564 const GCNSubtarget *CalleeST
1565 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1566
1567 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1568 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1569
1570 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1571 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1572 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1573 return false;
1574
1575 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1576 // no way to support merge for backend defined attributes.
1577 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1578 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1579 if (!CallerMode.isInlineCompatible(CalleeMode))
1580 return false;
1581
1582 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1583 Callee->hasFnAttribute(Attribute::InlineHint))
1584 return true;
1585
1586 // Hack to make compile times reasonable.
1587 if (InlineMaxBB) {
1588 // Single BB does not increase total BB amount.
1589 if (Callee->size() == 1)
1590 return true;
1591 size_t BBSize = Caller->size() + Callee->size() - 1;
1592 return BBSize <= InlineMaxBB;
1593 }
1594
1595 return true;
1596}
1597
1599 const SITargetLowering *TLI,
1600 const GCNTTIImpl *TTIImpl) {
1601 const int NrOfSGPRUntilSpill = 26;
1602 const int NrOfVGPRUntilSpill = 32;
1603
1604 const DataLayout &DL = TTIImpl->getDataLayout();
1605
1606 unsigned adjustThreshold = 0;
1607 int SGPRsInUse = 0;
1608 int VGPRsInUse = 0;
1609 for (const Use &A : CB->args()) {
1610 SmallVector<EVT, 4> ValueVTs;
1611 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1612 for (auto ArgVT : ValueVTs) {
1613 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1614 CB->getContext(), CB->getCallingConv(), ArgVT);
1616 SGPRsInUse += CCRegNum;
1617 else
1618 VGPRsInUse += CCRegNum;
1619 }
1620 }
1621
1622 // The cost of passing function arguments through the stack:
1623 // 1 instruction to put a function argument on the stack in the caller.
1624 // 1 instruction to take a function argument from the stack in callee.
1625 // 1 instruction is explicitly take care of data dependencies in callee
1626 // function.
1627 InstructionCost ArgStackCost(1);
1628 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1629 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1631 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1632 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1634
1635 // The penalty cost is computed relative to the cost of instructions and does
1636 // not model any storage costs.
1637 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1638 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1639 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1640 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1641 return adjustThreshold;
1642}
1643
/// Sums the allocation sizes of the allocas associated with the pointer
/// arguments of \p CB.
///
/// Only arguments whose pointer type is in the flat or private address space
/// are considered. Non-static allocas are ignored, an alloca already recorded
/// in AIVisited is not counted again, and an alloca for which
/// getAllocationSize(DL) yields no value contributes nothing.
///
/// NOTE(review): the declarations of AIVisited and AI sit on lines that are
/// missing from this listing (1651 and 1662), so how an argument is mapped to
/// its alloca cannot be confirmed from here.
///
/// \returns the accumulated size (in bytes, per the comment below), or 0 when
/// no argument qualifies.
1644static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1645 const DataLayout &DL) {
1646 // If we have a pointer to a private array passed into a function
1647 // it will not be optimized out, leaving scratch usage.
1648 // This function calculates the total size in bytes of the memory that would
1649 // end in scratch if the call was not inlined.
1650 unsigned AllocaSize = 0;
1652 for (Value *PtrArg : CB->args()) {
 // Non-pointer arguments cannot refer to an alloca.
1653 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1654 if (!Ty)
1655 continue;
1656
 // Only flat and private pointers are of interest here.
1657 unsigned AddrSpace = Ty->getAddressSpace();
1658 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1659 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1660 continue;
1661
 // Skip arguments without an alloca, dynamic allocas, and allocas that
 // were already counted for an earlier argument.
1663 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1664 continue;
1665
1666 if (auto Size = AI->getAllocationSize(DL))
1667 AllocaSize += Size->getFixedValue();
1668 }
1669 return AllocaSize;
1670}
1671
1676
1678 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1679
1680 // Private object passed as arguments may end up in scratch usage if the call
1681 // is not inlined. Increase the inline threshold to promote inlining.
1682 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1683 if (AllocaSize > 0)
1684 Threshold += ArgAllocaCost;
1685 return Threshold;
1686}
1687
1689 const AllocaInst *AI) const {
1690
1691 // Below the cutoff, assume that the private memory objects would be
1692 // optimized
1693 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1694 if (AllocaSize <= ArgAllocaCutoff)
1695 return 0;
1696
1697 // Above the cutoff, we give a cost to each private memory object
1698 // depending its size. If the array can be optimized by SROA this cost is not
1699 // added to the total-cost in the inliner cost analysis.
1700 //
1701 // We choose the total cost of the alloca such that their sum cancels the
1702 // bonus given in the threshold (ArgAllocaCost).
1703 //
1704 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1705 //
1706 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1707 // the single-bb bonus and the vector-bonus.
1708 //
1709 // We compensate the first two multipliers, by repeating logic from the
1710 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1711 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1712 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1713
1714 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1715 return BB.getTerminator()->getNumSuccessors() > 1;
1716 });
1717 if (SingleBB) {
1718 Threshold += Threshold / 2;
1719 }
1720
1721 auto ArgAllocaSize = AI->getAllocationSize(DL);
1722 if (!ArgAllocaSize)
1723 return 0;
1724
1725 // Attribute the bonus proportionally to the alloca size
1726 unsigned AllocaThresholdBonus =
1727 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1728
1729 return AllocaThresholdBonus;
1730}
1731
1734 OptimizationRemarkEmitter *ORE) const {
1735 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1736}
1737
1739 TTI::PeelingPreferences &PP) const {
1740 CommonTTI.getPeelingPreferences(L, SE, PP);
1741}
1742
1743int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
1744 return getQuarterRateInstrCost(CostKind);
1745}
1746
1747int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1748 return ST->hasFullRate64Ops()
1749 ? getFullRateInstrCost()
1750 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1751 : getQuarterRateInstrCost(CostKind);
1752}
1753
1754std::pair<InstructionCost, MVT>
1755GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1756 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1757 auto Size = DL.getTypeSizeInBits(Ty);
1758 // Maximum load or store can handle 8 dwords for scalar and 4 for
1759 // vector ALU. Let's assume anything above 8 dwords is expensive
1760 // even if legal.
1761 if (Size <= 256)
1762 return Cost;
1763
1764 Cost.first += (Size + 255) / 256;
1765 return Cost;
1766}
1767
1769 return ST->hasPrefetch() ? 128 : 0;
1770}
1771
1774}
1775
1777 const Function &F,
1778 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1779 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1780 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1781 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1782 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1783 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1784 ST->getFlatWorkGroupSizes(F);
1785 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1786 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1787 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1788 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1789 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1790}
1791
1794 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1795 return KnownIEEEMode::On; // Only mode on gfx1170+
1796
1797 const Function *F = I.getFunction();
1798 if (!F)
1800
1801 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1802 if (IEEEAttr.isValid())
1804
1805 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1807}
1808
1810 Align Alignment,
1811 unsigned AddressSpace,
1813 TTI::OperandValueInfo OpInfo,
1814 const Instruction *I) const {
1815 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1816 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1818 VecTy->getElementType()->isIntegerTy(8)) {
1819 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1821 }
1822 }
1823 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1824 OpInfo, I);
1825}
1826
1828 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1829 if (VecTy->getElementType()->isIntegerTy(8)) {
1830 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1831 return divideCeil(ElementCount - 1, 4);
1832 }
1833 }
1834 return BaseT::getNumberOfParts(Tp);
1835}
1836
1839 switch (Intrinsic->getIntrinsicID()) {
1840 case Intrinsic::amdgcn_wave_shuffle:
1842 default:
1843 break;
1844 }
1845 }
1846
1847 if (isAlwaysUniform(V))
1849
1850 if (isSourceOfDivergence(V))
1852
1854}
1855
1857 StackOffset BaseOffset,
1858 bool HasBaseReg, int64_t Scale,
1859 unsigned AddrSpace) const {
1860 if (HasBaseReg && Scale != 0) {
1861 // gfx1250+ can fold base+scale*index when scale matches the memory access
1862 // size (scale_offset bit). Supported for flat/global/constant/scratch
1863 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1864 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1866 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1867 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
1868 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1869 if (TypeSize::isKnownLE(StoreSize, TypeSize::getFixed(16)) &&
1870 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1871 return 0;
1872 }
1873 return 1;
1874 }
1875 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1876 AddrSpace);
1877}
1878
1880 const TTI::LSRCost &B) const {
1881 // Favor lower per-iteration work over preheader/setup costs.
1882 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1883 // effective instruction count (base+scale*index requires a separate ADD).
1884 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1885 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1886
1887 return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
1888 A.SetupCost, A.ImmCost, A.NumRegs) <
1889 std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
1890 B.SetupCost, B.ImmCost, B.NumRegs);
1891}
1892
1894 // isLSRCostLess de-prioritizes register count; keep consistent.
1895 return false;
1896}
1897
1899 // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
1900 return true;
1901}
1902
1904 const SmallBitVector &UniformArgs) const {
1906 switch (Intrinsic->getIntrinsicID()) {
1907 case Intrinsic::amdgcn_wave_shuffle:
1908 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1909 // is uniform.
1910 return UniformArgs[0] || UniformArgs[1];
1911 default:
1912 llvm_unreachable("unexpected intrinsic in isUniform");
1913 }
1914}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file defines a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for an arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:579
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool approxFunc() const
Definition FMF.h:70
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:869
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const override
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool shouldDropLSRSolutionIfLessProfitable() const override
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
ValueUniformity getValueUniformity(const Value *V) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
bool preferSLPInstCountCheck() const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
bool isNumRegsMajorCostOfLSR() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1069
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
@ Length
Definition DWP.cpp:558
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2172
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1752
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...