LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
32#include <optional>
33
34using namespace llvm;
35
36#define DEBUG_TYPE "AMDGPUtti"
37
39 "amdgpu-unroll-threshold-private",
40 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
41 cl::init(2700), cl::Hidden);
42
44 "amdgpu-unroll-threshold-local",
45 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
46 cl::init(1000), cl::Hidden);
47
49 "amdgpu-unroll-threshold-if",
50 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
51 cl::init(200), cl::Hidden);
52
54 "amdgpu-unroll-runtime-local",
55 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
56 cl::init(true), cl::Hidden);
57
59 "amdgpu-unroll-max-block-to-analyze",
60 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
61 cl::init(32), cl::Hidden);
62
// Cost credited for each alloca passed by pointer into a callee; presumably
// consumed by the inline-cost heuristics in this file — confirm at the use
// site (not visible in this chunk).
static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
                                       cl::Hidden, cl::init(4000),
                                       cl::desc("Cost of alloca argument"));
66
67// If the amount of scratch memory to eliminate exceeds our ability to allocate
68// it into registers we gain nothing by aggressively inlining functions for that
69// heuristic.
71 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
72 cl::init(256),
73 cl::desc("Maximum alloca size to use for inline cost"));
74
75// Inliner constraint to achieve reasonable compilation time.
77 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
78 cl::desc("Maximum number of BBs allowed in a function after inlining"
79 " (compile time constraint)"));
80
81// This default unroll factor is based on microbenchmarks on gfx1030.
83 "amdgpu-memcpy-loop-unroll",
84 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
85 "operations when lowering statically-sized memcpy, memmove, or"
86 "memset as a loop"),
87 cl::init(16), cl::Hidden);
88
// Return true if \p Cond transitively depends on a PHI node that belongs to
// \p L itself (i.e. is contained in no sub-loop of L). Recursion depth is
// capped at 10 to bound compile time. (The definition of I at the top of the
// body is on a line not visible in this chunk.)
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
                              unsigned Depth = 0) {
  if (!I)
    return false;

  for (const Value *V : I->operand_values()) {
    // NOTE(review): this guard tests I (the instruction being walked), not the
    // operand V, so it is invariant across this loop — verify intended.
    if (!L->contains(I))
      continue;
    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
      // A PHI contained in none of L's sub-loops must belong to L itself.
      if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
             return SubLoop->contains(PHI); }))
        return true;
    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
      return true;
  }
  return false;
}
107
// AMDGPUTTIImpl constructor (first signature line not visible in this chunk):
// caches the target triple, the GCN subtarget for the analyzed function, and
// its target-lowering object for use by the cost queries below.
    : BaseT(TM, F.getDataLayout()),
      TargetTriple(TM->getTargetTriple()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()) {}
113
// Tune loop-unrolling parameters for AMDGPU. The base threshold comes from
// the "amdgpu-unroll-threshold" function attribute (default 300) or from
// amdgpu.loop.unroll.threshold loop metadata; it is then raised — up to the
// private/local thresholds — for loops that branch on loop-local PHIs or
// compute addresses into private (scratch) or local (LDS) memory, since
// unrolling those helps SROA and DS-offset combining. (Several statements of
// the original are on lines not visible in this chunk.)
                                            OptimizationRemarkEmitter *ORE) const {
  const Function &F = *L->getHeader()->getParent();
  UP.Threshold =
      F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
  UP.MaxCount = std::numeric_limits<unsigned>::max();
  UP.Partial = true;

  // Conditional branch in a loop back edge needs 3 additional exec
  // manipulations in average.
  UP.BEInsns += 3;

  // We want to run unroll even for the loops which have been vectorized.
  UP.UnrollVectorizedLoop = true;

  // TODO: Do we want runtime unrolling?

  // Maximum alloca size than can fit registers. Reserve 16 registers.
  const unsigned MaxAlloca = (256 - 16) * 4;
  unsigned ThresholdPrivate = UnrollThresholdPrivate;
  unsigned ThresholdLocal = UnrollThresholdLocal;

  // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
  // provided threshold value as the default for Threshold
  if (MDNode *LoopUnrollThreshold =
          findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
    if (LoopUnrollThreshold->getNumOperands() == 2) {
          LoopUnrollThreshold->getOperand(1));
      if (MetaThresholdValue) {
        // We will also use the supplied value for PartialThreshold for now.
        // We may introduce additional metadata if it becomes necessary in the
        // future.
        UP.Threshold = MetaThresholdValue->getSExtValue();
        // The memory-boost thresholds never exceed the metadata threshold.
        ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
        ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
      }
    }
  }

  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
  for (const BasicBlock *BB : L->getBlocks()) {
    const DataLayout &DL = BB->getDataLayout();
    unsigned LocalGEPsSeen = 0;

    if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
           return SubLoop->contains(BB); }))
      continue; // Block belongs to an inner loop.

    for (const Instruction &I : *BB) {
      // Unroll a loop which contains an "if" statement whose condition
      // defined by a PHI belonging to the loop. This may help to eliminate
      // if region and potentially even PHI itself, saving on both divergence
      // and registers used for the PHI.
      // Add a small bonus for each of such "if" statements.
      // NOTE(review): 'CondBrInst' does not match the usual BranchInst
      // spelling — this chunk may be garbled; verify against upstream.
      if (const CondBrInst *Br = dyn_cast<CondBrInst>(&I)) {
        if (UP.Threshold < MaxBoost) {
          BasicBlock *Succ0 = Br->getSuccessor(0);
          BasicBlock *Succ1 = Br->getSuccessor(1);
          // Skip exiting branches: unrolling them does not remove the if.
          if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
              (L->contains(Succ1) && L->isLoopExiting(Succ1)))
            continue;
          if (dependsOnLocalPhi(L, Br->getCondition())) {
            LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
                              << " for loop:\n"
                              << *L << " due to " << *Br << '\n');
            if (UP.Threshold >= MaxBoost)
              return;
          }
        }
        continue;
      }

      if (!GEP)
        continue;

      unsigned AS = GEP->getAddressSpace();
      unsigned Threshold = 0;
        Threshold = ThresholdPrivate;
        Threshold = ThresholdLocal;
      else
        continue;

      if (UP.Threshold >= Threshold)
        continue;

      if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
        // Only boost for addressing into a small static alloca that could be
        // promoted to registers after unrolling.
        const Value *Ptr = GEP->getPointerOperand();
        const AllocaInst *Alloca =
        if (!Alloca || !Alloca->isStaticAlloca())
          continue;
        auto AllocaSize = Alloca->getAllocationSize(DL);
        if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
          continue;
      } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
        LocalGEPsSeen++;
        // Inhibit unroll for local memory if we have seen addressing not to
        // a variable, most likely we will be unable to combine it.
        // Do not unroll too deep inner loops for local memory to give a chance
        // to unroll an outer loop for a more important reason.
        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
             !isa<Argument>(GEP->getPointerOperand())))
          continue;
        LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
                          << *L << " due to LDS use.\n");
      }

      // Check if GEP depends on a value defined by this loop itself.
      bool HasLoopDef = false;
      for (const Value *Op : GEP->operands()) {
        const Instruction *Inst = dyn_cast<Instruction>(Op);
        if (!Inst || L->isLoopInvariant(Op))
          continue;

        if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
             return SubLoop->contains(Inst); }))
          continue;
        HasLoopDef = true;
        break;
      }
      if (!HasLoopDef)
        continue;

      // We want to do whatever we can to limit the number of alloca
      // instructions that make it through to the code generator. allocas
      // require us to use indirect addressing, which is slow and prone to
      // compiler bugs. If this loop does an address calculation on an
      // alloca ptr, then we want to use a higher than normal loop unroll
      // threshold. This will give SROA a better chance to eliminate these
      // allocas.
      //
      // We also want to have more unrolling for local memory to let ds
      // instructions with different offsets combine.
      //
      // Don't use the maximum allowed value here as it will make some
      // programs way too big.
      UP.Threshold = Threshold;
      LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
                        << " for loop:\n"
                        << *L << " due to " << *GEP << '\n');
      if (UP.Threshold >= MaxBoost)
        return;
    }

    // If we got a GEP in a small BB from inner loop then increase max trip
    // count to analyze for better estimation cost in unroll
    if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
  }
}
274
279
283
// Subtarget features whose caller/callee mismatch should not block inlining;
// presumably consulted by the inline-compatibility check (use site not
// visible in this chunk). Every listed feature is tuning- or
// environment-related rather than semantics-changing.
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
    // Codegen control options which don't matter.
    AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
    AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
    AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,

    AMDGPU::FeatureAutoWaitcntBeforeBarrier,

    // Property of the kernel/environment which can't actually differ.
    AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
    AMDGPU::FeatureTrapHandler,

    // The default assumption needs to be ecc is enabled, but no directly
    // exposed operations depend on it, so it can be safely inlined.
    AMDGPU::FeatureSRAMECC,

    // Perf-tuning features
    AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
302
// GCNTTIImpl constructor (first signature line and the definition of Mode are
// not visible in this chunk): caches the subtarget, lowering info, a common
// AMDGPU TTI helper and the graphics-calling-convention flag, then snapshots
// whether FP32 and FP64/FP16 denormals are preserved for the FP cost model.
    : BaseT(TM, F.getDataLayout()),
      ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
      TLI(ST->getTargetLowering()), CommonTTI(TM, F),
      IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
  // "Denormals enabled" here means the mode is anything other than
  // preserve-sign flushing.
  HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
  HasFP64FP16Denormals =
      Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
}
313
  // Branches can diverge unless the function is known to execute with a
  // single lane; a null function conservatively reports divergence.
  return !F || !ST->isSingleLaneExecution(*F);
}
317
318unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
319 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
320 // registers. See getRegisterClassForType for the implementation.
321 // In this case vector registers are not vector in terms of
322 // VGPRs, but those which can hold multiple values.
323
324 // This is really the number of registers to fill when vectorizing /
325 // interleaving loops, so we lie to avoid trying to use all registers.
326 return 4;
327}
328
  // Register width per TTI register kind. The case labels are on lines not
  // visible in this chunk; per the return values they correspond to a fixed
  // 32-bit scalar width, a vector width of 64 bits when packed-FP32 ops exist
  // (else 32), and zero scalable width (no scalable vectors on this target).
  switch (K) {
    return TypeSize::getFixed(32);
    return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
    return TypeSize::getScalable(0);
  }
  llvm_unreachable("Unsupported register kind");
}
341
  // Minimum vector register width reported to the vectorizer: 32 bits.
  return 32;
}
345
346unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
347 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
348 return 32 * 4 / ElemWidth;
349 // For a given width return the max 0number of elements that can be combined
350 // into a wider bit value:
351 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
352 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
353 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
354 : 1;
355}
356
357unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
358 unsigned ChainSizeInBytes,
359 VectorType *VecTy) const {
360 unsigned VecRegBitWidth = VF * LoadSize;
361 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
362 // TODO: Support element-size less than 32bit?
363 return 128 / LoadSize;
364
365 return VF;
366}
367
368unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
369 unsigned ChainSizeInBytes,
370 VectorType *VecTy) const {
371 unsigned VecRegBitWidth = VF * StoreSize;
372 if (VecRegBitWidth > 128)
373 return 128 / StoreSize;
374
375 return VF;
376}
377
unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
  // Widest vectorized load/store (in bits) per address space. Two disjuncts
  // of the condition below are on lines not visible in this chunk.
  if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
      AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
      AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
      AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
    return 512;
  }

  // Private (scratch) accesses are limited by the subtarget's maximum private
  // element size, converted from bytes to bits.
  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
    return 8 * ST->getMaxPrivateElementSize();

  // Common to flat, global, local and region. Assume for unknown addrspace.
  return 128;
}
394
395bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
396 Align Alignment,
397 unsigned AddrSpace) const {
398 // We allow vectorization of flat stores, even though we may need to decompose
399 // them later if they may access private memory. We don't have enough context
400 // here, and legalization can handle it.
401 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
402 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
403 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
404 }
405 return true;
406}
407
408bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
409 Align Alignment,
410 unsigned AddrSpace) const {
411 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
412}
413
414bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
415 Align Alignment,
416 unsigned AddrSpace) const {
417 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
418}
419
423
// Choose the widest operand type for lowering a memcpy/memmove/memset as an
// inline loop. (The first signature line and the statement applying
// MemcpyLoopUnroll are on lines not visible in this chunk.)
    LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
    unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicElementSize) const {

  // Element-wise atomic copies must use exactly the requested element width.
  if (AtomicElementSize)
    return Type::getIntNTy(Context, *AtomicElementSize * 8);

  // 16-byte accesses achieve the highest copy throughput.
  // If the operation has a fixed known length that is large enough, it is
  // worthwhile to return an even wider type and let legalization lower it into
  // multiple accesses, effectively unrolling the memcpy loop.
  // We also rely on legalization to decompose into smaller accesses for
  // subtargets and address spaces where it is necessary.
  //
  // Don't unroll if Length is not a constant, since unrolling leads to worse
  // performance for length values that are smaller or slightly larger than the
  // total size of the type returned here. Mitigating that would require a more
  // complex lowering for variable-length memcpy and memmove.
  unsigned I32EltsInVector = 4;
                        MemcpyLoopUnroll * I32EltsInVector);

  return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
}
450
// Emit the operand types used to copy the residual bytes left over after the
// main vectorized memcpy loop: greedily 16-byte <4 x i32> chunks, then i64,
// i32, i16, and finally single i8 bytes. (The first signature line and the
// start of the base-class delegation call are not visible in this chunk.)
    SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
    unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
    Align SrcAlign, Align DestAlign,
    std::optional<uint32_t> AtomicCpySize) const {

  // Atomic copies are delegated to the base implementation.
  if (AtomicCpySize)
        OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
        DestAlign, AtomicCpySize);

  Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
  while (RemainingBytes >= 16) {
    OpsOut.push_back(I32x4Ty);
    RemainingBytes -= 16;
  }

  Type *I64Ty = Type::getInt64Ty(Context);
  while (RemainingBytes >= 8) {
    OpsOut.push_back(I64Ty);
    RemainingBytes -= 8;
  }

  Type *I32Ty = Type::getInt32Ty(Context);
  while (RemainingBytes >= 4) {
    OpsOut.push_back(I32Ty);
    RemainingBytes -= 4;
  }

  Type *I16Ty = Type::getInt16Ty(Context);
  while (RemainingBytes >= 2) {
    OpsOut.push_back(I16Ty);
    RemainingBytes -= 2;
  }

  Type *I8Ty = Type::getInt8Ty(Context);
  while (RemainingBytes) {
    OpsOut.push_back(I8Ty);
    --RemainingBytes;
  }
}
492
  // Interleave factor for the vectorizer: 1 for scalar (non-vectorized)
  // loops, 8 otherwise. (The signature line is not visible in this chunk.)
  // Disable unrolling if the loop is not vectorized.
  // TODO: Enable this again.
  if (VF.isScalar())
    return 1;

  return 8;
}
501
// Fill \p Info for target intrinsics that behave like memory operations.
// Only the ds_ordered_add/ds_ordered_swap intrinsics are handled; everything
// else returns false. (The first signature line is not visible in this
// chunk.)
                                        MemIntrinsicInfo &Info) const {
  switch (Inst->getIntrinsicID()) {
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    // Operand 2 carries the atomic ordering and operand 4 the volatile flag;
    // both must be compile-time constants or the call is malformed.
    auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
    auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
    if (!Ordering || !Volatile)
      return false; // Invalid.

    // Reject ordering values outside the AtomicOrdering enum range.
    unsigned OrderingVal = Ordering->getZExtValue();
    if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
      return false;

    Info.PtrVal = Inst->getArgOperand(0);
    Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
    Info.ReadMem = true;
    Info.WriteMem = true;
    Info.IsVolatile = !Volatile->isZero();
    return true;
  }
  default:
    return false;
  }
}
527
// Cost model for arithmetic instructions: legalize the type, account for
// vector splitting, then price each ISD opcode according to its VALU issue
// rate (full/half/quarter/64-bit/transcendental). Several statements of the
// original (first signature line, OperandValueInfo parameters, and the
// zero-cost returns inside the FMUL fusion check) are on lines not visible in
// this chunk.
    unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
    ArrayRef<const Value *> Args, const Instruction *CxtI) const {

  // Legalize the type.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  int ISD = TLI->InstructionOpcodeToISD(Opcode);

  // Because we don't have any legal vector operations, but the legal types, we
  // need to account for split vectors.
  unsigned NElts = LT.second.isVector() ?
    LT.second.getVectorNumElements() : 1;

  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;

  switch (ISD) {
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SRA:
    if (SLT == MVT::i64)
      return get64BitInstrCost(CostKind) * LT.first * NElts;

    // 16-bit ops pack two elements per instruction when available.
    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return getFullRateInstrCost() * LT.first * NElts;
  case ISD::ADD:
  case ISD::SUB:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    if (SLT == MVT::i64) {
      // and, or and xor are typically split into 2 VALU instructions.
      return 2 * getFullRateInstrCost() * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    return LT.first * NElts * getFullRateInstrCost();
  case ISD::MUL: {
    const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
    if (SLT == MVT::i64) {
      const int FullRateCost = getFullRateInstrCost();
      return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
    }

    if (ST->has16BitInsts() && SLT == MVT::i16)
      NElts = (NElts + 1) / 2;

    // i32
    return QuarterRateCost * NElts * LT.first;
  }
  case ISD::FMUL:
    // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
    // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
    // fused operation.
    // NOTE(review): the returns taken when these conditions hold are on lines
    // not visible in this chunk.
    if (CxtI && CxtI->hasOneUse())
      if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
        const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
        if (OPC == ISD::FADD || OPC == ISD::FSUB) {
          if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
          if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)

          // Estimate all types may be fused with contract/unsafe flags
          const TargetOptions &Options = TLI->getTargetMachine().Options;
          if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
              (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
        }
      }
    [[fallthrough]];
  case ISD::FADD:
  case ISD::FSUB:
    // Packed f32/bf16 variants process two elements per instruction.
    if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
      NElts = (NElts + 1) / 2;
    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
      NElts = (NElts + 1) / 2;
    if (SLT == MVT::f64)
      return LT.first * NElts * get64BitInstrCost(CostKind);

    if (ST->has16BitInsts() && SLT == MVT::f16)
      NElts = (NElts + 1) / 2;

    if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
      return LT.first * NElts * getFullRateInstrCost();
    break;
  case ISD::FDIV:
  case ISD::FREM:
    // FIXME: frem should be handled separately. The fdiv in it is most of it,
    // but the current lowering is also not entirely correct.
    if (SLT == MVT::f64) {
      int Cost = 7 * get64BitInstrCost(CostKind) +
                 getQuarterRateInstrCost(CostKind) +
                 3 * getHalfRateInstrCost(CostKind);
      // Add cost of workaround.
      if (!ST->hasUsableDivScaleConditionOutput())
        Cost += 3 * getFullRateInstrCost();

      return LT.first * Cost * NElts;
    }

    // Reciprocal (1.0 / x) lowers to a single transcendental rcp.
    if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
      // TODO: This is more complicated, unsafe flags etc.
      if ((SLT == MVT::f32 && !HasFP32Denormals) ||
          (SLT == MVT::f16 && ST->has16BitInsts())) {
        return LT.first * getTransInstrCost(CostKind) * NElts;
      }
    }

    if (SLT == MVT::f16 && ST->has16BitInsts()) {
      // 2 x v_cvt_f32_f16
      // f32 rcp
      // f32 fmul
      // v_cvt_f16_f32
      // f16 div_fixup
      int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
      // Fast unsafe fdiv lowering:
      // f32 rcp
      // f32 fmul
      int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
      return LT.first * Cost * NElts;
    }

    if (SLT == MVT::f32 || SLT == MVT::f16) {
      // 4 more v_cvt_* insts without f16 insts support
      int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
                 1 * getTransInstrCost(CostKind);

      if (!HasFP32Denormals) {
        // FP mode switches.
        Cost += 2 * getFullRateInstrCost();
      }

      return LT.first * NElts * Cost;
    }
    break;
  case ISD::FNEG:
    // Use the backend' estimation. If fneg is not free each element will cost
    // one additional instruction.
    return TLI->isFNegFree(SLT) ? 0 : NElts;
  default:
    break;
  }

  return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
                                       Args, CxtI);
}
684
// Return true if there's a potential benefit from using v2f16/v2i16
// instructions for an intrinsic, even if it requires nontrivial legalization.
// (Presumably consulted by getIntrinsicInstrCost — the call site is on lines
// not visible in this chunk. The signature line is also not visible.)
  switch (ID) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
  case Intrinsic::copysign:
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum:
  case Intrinsic::canonicalize:
  // There's a small benefit to using vector ops in the legalized code.
  case Intrinsic::round:
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat:
  case Intrinsic::abs:
    return true;
  default:
    return false;
  }
}
707
// Cost model for intrinsic calls. Values read from hardware/argument
// registers (workitem/workgroup IDs, dispatch pointers, ...) and fabs are
// free; f32/f64 transcendental expansions (exp/log/sin/cos/sqrt) are priced
// per their lowering sequences; everything else falls through to a generic
// estimate that halves the element count for packed 16-bit (and packed-f32)
// types. (The signature lines and a few statements — e.g. the Cost
// declarations feeding the transcendental totals — are not visible in this
// chunk.)
  switch (ICA.getID()) {
  case Intrinsic::fabs:
    // Free source modifier in the common case.
    return 0;
  case Intrinsic::amdgcn_workitem_id_x:
  case Intrinsic::amdgcn_workitem_id_y:
  case Intrinsic::amdgcn_workitem_id_z:
    // TODO: If hasPackedTID, or if the calling context is not an entry point
    // there may be a bit instruction.
    return 0;
  case Intrinsic::amdgcn_workgroup_id_x:
  case Intrinsic::amdgcn_workgroup_id_y:
  case Intrinsic::amdgcn_workgroup_id_z:
  case Intrinsic::amdgcn_lds_kernel_id:
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_dispatch_id:
  case Intrinsic::amdgcn_implicitarg_ptr:
  case Intrinsic::amdgcn_queue_ptr:
    // Read from an argument register.
    return 0;
  default:
    break;
  }

  Type *RetTy = ICA.getReturnType();

  Intrinsic::ID IID = ICA.getID();
  switch (IID) {
  case Intrinsic::exp:
  case Intrinsic::exp2:
  case Intrinsic::exp10: {
    // Legalize the type.
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
    unsigned NElts =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;

    if (SLT == MVT::f64) {
      // f64 expansion: ~20 double-rate ops, +1 for exp (base conversion),
      // +3 for exp10.
      unsigned NumOps = 20;
      if (IID == Intrinsic::exp)
        ++NumOps;
      else if (IID == Intrinsic::exp10)
        NumOps += 3;

      return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
    }

    if (SLT == MVT::f32) {
      unsigned NumFullRateOps = 0;
      // v_exp_f32 (transcendental).
      unsigned NumTransOps = 1;

      if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
        // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
        // overflow/underflow checks (lowerFEXP). Denorm is also handled.
        // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
        NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
      } else {
        if (IID == Intrinsic::exp) {
          // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
          NumFullRateOps = 1;
        } else if (IID == Intrinsic::exp10) {
          // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
          NumFullRateOps = 3;
          NumTransOps = 2;
        }
        // Denorm scaling adds setcc + select + fadd + select + fmul.
        if (HasFP32Denormals)
          NumFullRateOps += 5;
      }

      InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
                             NumTransOps * getTransInstrCost(CostKind);
      return LT.first * NElts * Cost;
    }

    break;
  }
  case Intrinsic::log:
  case Intrinsic::log2:
  case Intrinsic::log10: {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
    unsigned NElts =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;

    if (SLT == MVT::f32) {
      unsigned NumFullRateOps = 0;

      if (IID == Intrinsic::log2) {
        // LowerFLOG2: just v_log_f32.
      } else if (ICA.getFlags().approxFunc()) {
        // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
        NumFullRateOps = 1;
      } else {
        // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
        // multiply + finite check.
        NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
      }

      if (HasFP32Denormals)
        NumFullRateOps += 5;

          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
      return LT.first * NElts * Cost;
    }

    break;
  }
  case Intrinsic::sin:
  case Intrinsic::cos: {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
    unsigned NElts =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;

    if (SLT == MVT::f32) {
      // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
      unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;

          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
      return LT.first * NElts * Cost;
    }

    break;
  }
  case Intrinsic::sqrt: {
    std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
    MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
    unsigned NElts =
        LT.second.isVector() ? LT.second.getVectorNumElements() : 1;

    if (SLT == MVT::f32) {
      unsigned NumFullRateOps = 0;

      if (!ICA.getFlags().approxFunc()) {
        // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
        NumFullRateOps = HasFP32Denormals ? 17 : 16;
      }

          NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
      return LT.first * NElts * Cost;
    }

    break;
  }
  default:
    break;
  }

  // (A guard preceding the generic path is on lines not visible in this
  // chunk.)

  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
  MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
  unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;

  if ((ST->hasVOP3PInsts() &&
       (SLT == MVT::f16 || SLT == MVT::i16 ||
        (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
      (ST->hasPackedFP32Ops() && SLT == MVT::f32))
    NElts = (NElts + 1) / 2;

  // TODO: Get more refined intrinsic costs?
  unsigned InstRate = getQuarterRateInstrCost(CostKind);

  switch (ICA.getID()) {
  case Intrinsic::fma:
  case Intrinsic::fmuladd:
    if (SLT == MVT::f64) {
      InstRate = get64BitInstrCost(CostKind);
      break;
    }

    if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
      InstRate = getFullRateInstrCost();
    else {
      InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
                                     : getQuarterRateInstrCost(CostKind);
    }
    break;
  case Intrinsic::copysign:
    return NElts * getFullRateInstrCost();
  case Intrinsic::minimumnum:
  case Intrinsic::maximumnum: {
    // Instruction + 2 canonicalizes. For cases that need type promotion, the
    // promotion takes the place of the canonicalize.
    unsigned NumOps = 3;
    if (const IntrinsicInst *II = ICA.getInst()) {
      // Directly legal with ieee=0
      // TODO: Not directly legal with strictfp
      // (the condition guarding this assignment is not visible in this chunk)
        NumOps = 1;
    }

    unsigned BaseRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    InstRate = BaseRate * NumOps;
    break;
  }
  case Intrinsic::canonicalize: {
    InstRate =
        SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
    break;
  }
  case Intrinsic::uadd_sat:
  case Intrinsic::usub_sat:
  case Intrinsic::sadd_sat:
  case Intrinsic::ssub_sat: {
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = getFullRateInstrCost();

    // Saturating math on v2i16/v4i16 is a single packed operation.
    static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
    if (any_of(ValidSatTys, equal_to(LT.second)))
      NElts = 1;
    break;
  }
  case Intrinsic::abs:
    // Expansion takes 2 instructions for VALU
    if (SLT == MVT::i16 || SLT == MVT::i32)
      InstRate = 2 * getFullRateInstrCost();
    break;
  default:
    break;
  }

  return LT.first * NElts * InstRate;
}
942
// Cost model for control-flow instructions. Size-oriented cost kinds (the
// initializer of SCost is on a line not visible in this chunk) use small
// counts; latency-oriented kinds include the extra exec-mask manipulation a
// divergent branch needs.
                                       const Instruction *I) const {
  assert((I == nullptr || I->getOpcode() == Opcode) &&
         "Opcode should reflect passed instruction.");
  const bool SCost =
  const int CBrCost = SCost ? 5 : 7;
  switch (Opcode) {
  case Instruction::UncondBr:
    // Branch instruction takes about 4 slots on gfx900.
    return SCost ? 1 : 4;
  case Instruction::CondBr:
    // Suppose conditional branch takes additional 3 exec manipulations
    // instructions in average.
    return CBrCost;
  case Instruction::Switch: {
    const auto *SI = dyn_cast_or_null<SwitchInst>(I);
    // Each case (including default) takes 1 cmp + 1 cbr instructions in
    // average.
    return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
  }
  case Instruction::Ret:
    return SCost ? 1 : 10;
  }
  return BaseT::getCFInstrCost(Opcode, CostKind, I);
}
970
// Cost of a horizontal arithmetic reduction. (The first signature lines and
// the guard preceding the first base-class fallback are not visible in this
// chunk.)
                                           std::optional<FastMathFlags> FMF,
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions(which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);

  // With packed math the reduction runs at full rate per legalized part.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getFullRateInstrCost();
}
988
// Cost of a horizontal min/max reduction. (The first signature lines are not
// visible in this chunk.)
                                       FastMathFlags FMF,
  EVT OrigTy = TLI->getValueType(DL, Ty);

  // Computes cost on targets that have packed math instructions(which support
  // 16-bit types only).
  if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
    return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);

  // Packed 16-bit min/max reductions run at half rate per legalized part.
  std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
  return LT.first * getHalfRateInstrCost(CostKind);
}
1003
// Cost of vector element insert/extract. Sub-32-bit elements mostly defer to
// the base model (with lane 0 of a 16-bit element free when 16-bit insts
// exist); 32-bit-or-wider elements are subregister reads/writes and are free
// unless dynamically indexed. (The first signature line is not visible in
// this chunk.)
    unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
    const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
  switch (Opcode) {
  case Instruction::ExtractElement:
  case Instruction::InsertElement: {
    unsigned EltSize
      = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
    if (EltSize < 32) {
      if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
        return 0;
      return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
                                       VIC);
    }

    // Extracts are just reads of a subregister, so are free. Inserts are
    // considered free because we don't want to have any cost for scalarizing
    // operations, and we don't have to copy into a different register class.

    // Dynamic indexing isn't free and is best avoided.
    return Index == ~0u ? 2 : 0;
  }
  default:
    return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
                                     VIC);
  }
}
1031
1032/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
1033/// this is analyzing the collective result of all output registers. Otherwise,
1034/// this is only querying a specific result index if this returns multiple
1035/// registers in a struct.
/// \returns true (divergent) conservatively whenever an output constraint
/// does not resolve to an SGPR register class.
// NOTE(review): the signature head line was dropped from this listing.
1037 const CallInst *CI, ArrayRef<unsigned> Indices) const {
1038 // TODO: Handle complex extract indices
// More than one extract index: conservatively report divergent.
1039 if (Indices.size() > 1)
1040 return true;
1041
1042 const DataLayout &DL = CI->getDataLayout();
1043 const SIRegisterInfo *TRI = ST->getRegisterInfo();
1044 TargetLowering::AsmOperandInfoVector TargetConstraints =
1045 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
1046
// -1 means "check every output"; otherwise only the requested struct index.
1047 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
1048
1049 int OutputIdx = 0;
1050 for (auto &TC : TargetConstraints) {
1051 if (TC.Type != InlineAsm::isOutput)
1052 continue;
1053
1054 // Skip outputs we don't care about.
1055 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
1056 continue;
1057
// No DAG node is available here, so pass an empty SDValue.
1058 TLI->ComputeConstraintToUse(TC, SDValue());
1059
1060 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
1061 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
1062
1063 // For AGPR constraints null is returned on subtargets without AGPRs, so
1064 // assume divergent for null.
1065 if (!RC || !TRI->isSGPRClass(RC))
1066 return true;
1067 }
1068
// Every queried output landed in an SGPR class: uniform.
1069 return false;
1070}
1071
// \returns true if a read_register intrinsic reads a divergent (per-lane)
// register, judging by the register name metadata on operand 0.
// NOTE(review): the signature head and the `StringRef RegName =` declaration
// line were dropped from this listing.
1073 const IntrinsicInst *ReadReg) const {
1074 Metadata *MD =
1075 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
1077 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1078
1079 // Special case registers that look like VCC.
// An i1 result implies a lane-mask-like register: treat as divergent.
1080 MVT VT = MVT::getVT(ReadReg->getType());
1081 if (VT == MVT::i1)
1082 return true;
1083
// Names beginning with "vcc" (and empty names) are scalar despite the
// leading 'v', so they are uniform.
1084 // Special case scalar registers that start with 'v'.
1085 if (RegName.starts_with("vcc") || RegName.empty())
1086 return false;
1087
1088 // VGPR or AGPR is divergent. There aren't any specially named vector
1089 // registers.
1090 return RegName[0] == 'v' || RegName[0] == 'a';
1091}
1092
1093/// \returns true if the result of the value could potentially be
1094/// different across workitems in a wavefront.
1095bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
// Arguments: the divergence decision is on the (dropped) next listing line.
// NOTE(review): the return expression for the Argument case is not visible
// in this listing.
1096 if (const Argument *A = dyn_cast<Argument>(V))
1098
1099 // Loads from the private and flat address spaces are divergent, because
1100 // threads can execute the load instruction with the same inputs and get
1101 // different results.
1102 //
1103 // All other loads are not divergent, because if threads issue loads with the
1104 // same arguments, they will always get the same result.
1105 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1106 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1107 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1108
1109 // Atomics are divergent because they are executed sequentially: when an
1110 // atomic operation refers to the same address in each thread, then each
1111 // thread after the first sees the value written by the previous thread as
1112 // original value.
// NOTE(review): the isa<> guard for the atomic case was dropped from this
// listing; this `return true` belongs to it.
1114 return true;
1115
// NOTE(review): the IntrinsicInst dyn_cast guard line was dropped here.
1117 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1118 switch (IID) {
1119 case Intrinsic::read_register:
// NOTE(review): the read_register handling line (presumably delegating to
// isReadRegisterSourceOfDivergence) was dropped from this listing.
1121 case Intrinsic::amdgcn_addrspacecast_nonnull: {
// A private->flat cast is only divergent when scratch is globally
// addressable (the aperture mapping is then per-wave).
1122 unsigned SrcAS =
1123 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1124 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1125 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1126 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1127 ST->hasGloballyAddressableScratch();
1128 }
1129 case Intrinsic::amdgcn_workitem_id_y:
1130 case Intrinsic::amdgcn_workitem_id_z: {
// workitem id y/z is uniform within a wave when waves split the X
// dimension evenly, or when the required size of this dimension is 1.
1131 const Function *F = Intrinsic->getFunction();
1132 bool HasUniformYZ =
1133 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
1134 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1135 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1136 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1137 }
1138 default:
// NOTE(review): the default-case statement was dropped from this listing.
1140 }
1141 }
1142
1143 // Assume all function calls are a source of divergence.
1144 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
// NOTE(review): the inline-asm handling line was dropped from this listing;
// non-asm calls are conservatively divergent.
1145 if (CI->isInlineAsm())
1147 return true;
1148 }
1149
1150 // Assume all function calls are a source of divergence.
1151 if (isa<InvokeInst>(V))
1152 return true;
1153
1154 // If the target supports globally addressable scratch, the mapping from
1155 // scratch memory to the flat aperture changes therefore an address space cast
1156 // is no longer uniform.
1157 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1158 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1159 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1160 ST->hasGloballyAddressableScratch();
1161 }
1162
1163 return false;
1164}
1165
/// \returns true if \p V is provably uniform across all lanes of a wave:
/// intrinsics registered as always-uniform, SGPR-only inline asm results,
/// and TID-derived expressions that cannot vary within a wave.
1166bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1167 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1168 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1169
1170 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
// NOTE(review): the inline-asm return expression (presumably
// !isInlineAsmSourceOfDivergence(CI)) was dropped from this listing.
1171 if (CI->isInlineAsm())
1173 return false;
1174 }
1175
1176 // In most cases TID / wavefrontsize is uniform.
1177 //
1178 // However, if a kernel has uneven dimensions we can have a value of
1179 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1180 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1181 // packed into a same wave which gives 1 and 0 after the division by 64
1182 // respectively.
1183 //
1184 // The X dimension doesn't reset within a wave if either both the Y
1185 // and Z dimensions are of length 1, or if the X dimension's required
1186 // size is a power of 2. Note, however, if the X dimension's maximum
1187 // size is a power of 2 < the wavefront size, division by the wavefront
1188 // size is guaranteed to yield 0, so this is also a no-reset case.
1189 bool XDimDoesntResetWithinWaves = false;
1190 if (auto *I = dyn_cast<Instruction>(V)) {
1191 const Function *F = I->getFunction();
1192 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1193 }
1194 using namespace llvm::PatternMatch;
1195 uint64_t C;
// NOTE(review): the two m_Intrinsic pattern lines (matching workitem-id-x
// shifted/divided by a constant) were dropped from this listing.
1197 m_ConstantInt(C))) ||
1199 m_ConstantInt(C)))) {
// Shifting by at least log2(wavefront size) removes all intra-wave bits.
1200 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1201 }
1202
1203 Value *Mask;
// NOTE(review): the m_Intrinsic-and-mask pattern line was dropped here.
1205 m_Value(Mask)))) {
// Masking off all intra-wave bits (>= log2(wavesize) trailing zeros)
// yields a wave-uniform value.
1206 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1207 ST->getWavefrontSizeLog2() &&
1208 XDimDoesntResetWithinWaves;
1209 }
1210
1211 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1212 if (!ExtValue)
1213 return false;
1214
1215 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1216 if (!CI)
1217 return false;
1218
1219 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1220 switch (Intrinsic->getIntrinsicID()) {
1221 default:
1222 return false;
1223 case Intrinsic::amdgcn_if:
1224 case Intrinsic::amdgcn_else: {
// Field 1 of the amdgcn.if/else result struct is the uniform exec mask.
1225 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1226 return Indices.size() == 1 && Indices[0] == 1;
1227 }
1228 }
1229 }
1230
1231 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1232 // divergent for the overall struct return. We need to override it in the
1233 // case we're extracting an SGPR component here.
1234 if (CI->isInlineAsm())
1235 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1236
1237 return false;
1238}
1239
// Report which operand indices of intrinsic \p IID are flat pointers that
// address-space inference may rewrite. For every intrinsic handled here the
// pointer is operand 0. Returns false for intrinsics with no such operand.
// NOTE(review): the signature head line was dropped from this listing.
1241 Intrinsic::ID IID) const {
1242 switch (IID) {
1243 case Intrinsic::amdgcn_is_shared:
1244 case Intrinsic::amdgcn_is_private:
1245 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1246 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1247 case Intrinsic::amdgcn_load_to_lds:
1248 case Intrinsic::amdgcn_make_buffer_rsrc:
1249 OpIndexes.push_back(0);
1250 return true;
1251 default:
1252 return false;
1253 }
1254}
1255
// Rewrite intrinsic \p II so it operates on \p NewV (a pointer with an
// inferred, non-flat address space) instead of \p OldV. Returns the
// replacement value (possibly a constant fold), the mutated call itself, or
// nullptr when no rewrite applies.
// NOTE(review): the signature head line was dropped from this listing.
1257 Value *OldV,
1258 Value *NewV) const {
1259 auto IntrID = II->getIntrinsicID();
1260 switch (IntrID) {
1261 case Intrinsic::amdgcn_is_shared:
1262 case Intrinsic::amdgcn_is_private: {
// Once the address space is known, is_shared / is_private folds to a
// constant i1: true iff the new AS matches the queried one.
// NOTE(review): the constant operands of the two ?: expressions below were
// dropped from this listing.
1263 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1265 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1266 LLVMContext &Ctx = NewV->getType()->getContext();
1267 ConstantInt *NewVal = (TrueAS == NewAS) ?
1269 return NewVal;
1270 }
1271 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1272 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
// Re-declare the intrinsic with the concrete pointer type and redirect
// operand 0 to the new pointer.
// NOTE(review): the guard condition before `return nullptr` and the
// NewDecl declaration line were dropped from this listing.
1273 Type *DestTy = II->getType();
1274 Type *SrcTy = NewV->getType();
1275 unsigned NewAS = SrcTy->getPointerAddressSpace();
1277 return nullptr;
1278 Module *M = II->getModule();
1280 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1281 II->setArgOperand(0, NewV);
1282 II->setCalledFunction(NewDecl);
1283 return II;
1284 }
1285 case Intrinsic::amdgcn_load_to_lds: {
1286 Type *SrcTy = NewV->getType();
1287 Module *M = II->getModule();
1288 Function *NewDecl =
1289 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1290 II->setArgOperand(0, NewV);
1291 II->setCalledFunction(NewDecl);
1292 return II;
1293 }
1294 case Intrinsic::amdgcn_make_buffer_rsrc: {
// NOTE(review): the NewDecl declaration line was dropped from this listing.
1295 Type *SrcTy = NewV->getType();
1296 Type *DstTy = II->getType();
1297 Module *M = II->getModule();
1299 M, II->getIntrinsicID(), {DstTy, SrcTy});
1300 II->setArgOperand(0, NewV);
1301 II->setCalledFunction(NewDecl);
1302 return II;
1303 }
1304 default:
1305 return nullptr;
1306 }
1307}
1308
1310 VectorType *DstTy, VectorType *SrcTy,
1311 ArrayRef<int> Mask,
1313 int Index, VectorType *SubTp,
1315 const Instruction *CxtI) const {
1316 if (!isa<FixedVectorType>(SrcTy))
1317 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1318 SubTp);
1319
1320 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1321
1322 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1323 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1324 (ScalarSize == 16 || ScalarSize == 8)) {
1325 // Larger vector widths may require additional instructions, but are
1326 // typically cheaper than scalarized versions.
1327 //
1328 // We assume that shuffling at a register granularity can be done for free.
1329 // This is not true for vectors fed into memory instructions, but it is
1330 // effectively true for all other shuffling. The emphasis of the logic here
1331 // is to assist generic transform in cleaning up / canonicalizing those
1332 // shuffles.
1333
1334 // With op_sel VOP3P instructions freely can access the low half or high
1335 // half of a register, so any swizzle of two elements is free.
1336 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1337 unsigned NumSrcElts = SrcVecTy->getNumElements();
1338 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1339 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1340 Kind == TTI::SK_PermuteSingleSrc))
1341 return 0;
1342 }
1343
1344 unsigned EltsPerReg = 32 / ScalarSize;
1345 switch (Kind) {
1346 case TTI::SK_Broadcast:
1347 // A single v_perm_b32 can be re-used for all destination registers.
1348 return 1;
1349 case TTI::SK_Reverse:
1350 // One instruction per register.
1351 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1352 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1355 if (Index % EltsPerReg == 0)
1356 return 0; // Shuffling at register granularity
1357 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1358 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1361 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1362 if (!DstVecTy)
1364 unsigned NumDstElts = DstVecTy->getNumElements();
1365 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1366 unsigned EndIndex = Index + NumInsertElts;
1367 unsigned BeginSubIdx = Index % EltsPerReg;
1368 unsigned EndSubIdx = EndIndex % EltsPerReg;
1369 unsigned Cost = 0;
1370
1371 if (BeginSubIdx != 0) {
1372 // Need to shift the inserted vector into place. The cost is the number
1373 // of destination registers overlapped by the inserted vector.
1374 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1375 }
1376
1377 // If the last register overlap is partial, there may be three source
1378 // registers feeding into it; that takes an extra instruction.
1379 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1380 Cost += 1;
1381
1382 return Cost;
1383 }
1384 case TTI::SK_Splice: {
1385 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1386 if (!DstVecTy)
1388 unsigned NumElts = DstVecTy->getNumElements();
1389 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1390 // Determine the sub-region of the result vector that requires
1391 // sub-register shuffles / mixing.
1392 unsigned EltsFromLHS = NumElts - Index;
1393 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1394 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1395 if (LHSIsAligned && RHSIsAligned)
1396 return 0;
1397 if (LHSIsAligned && !RHSIsAligned)
1398 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1399 if (!LHSIsAligned && RHSIsAligned)
1400 return divideCeil(EltsFromLHS, EltsPerReg);
1401 return divideCeil(NumElts, EltsPerReg);
1402 }
1403 default:
1404 break;
1405 }
1406
1407 if (!Mask.empty()) {
1408 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1409
1410 // Generically estimate the cost by assuming that each destination
1411 // register is derived from sources via v_perm_b32 instructions if it
1412 // can't be copied as-is.
1413 //
1414 // For each destination register, derive the cost of obtaining it based
1415 // on the number of source registers that feed into it.
1416 unsigned Cost = 0;
1417 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1419 bool Aligned = true;
1420 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1421 int SrcIdx = Mask[DstIdx + I];
1422 if (SrcIdx == -1)
1423 continue;
1424 int Reg;
1425 if (SrcIdx < (int)NumSrcElts) {
1426 Reg = SrcIdx / EltsPerReg;
1427 if (SrcIdx % EltsPerReg != I)
1428 Aligned = false;
1429 } else {
1430 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1431 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1432 Aligned = false;
1433 }
1434 if (!llvm::is_contained(Regs, Reg))
1435 Regs.push_back(Reg);
1436 }
1437 if (Regs.size() >= 2)
1438 Cost += Regs.size() - 1;
1439 else if (!Aligned)
1440 Cost += 1;
1441 }
1442 return Cost;
1443 }
1444 }
1445
1446 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1447 SubTp);
1448}
1449
1450/// Whether it is profitable to sink the operands of an
1451/// Instruction I to the basic block of I.
1452/// This helps using several modifiers (like abs and neg) more often.
/// Collects the sinkable uses into \p Ops and returns true if any were found.
// NOTE(review): the signature head line was dropped from this listing.
1454 SmallVectorImpl<Use *> &Ops) const {
1455 using namespace PatternMatch;
1456
1457 for (auto &Op : I->operands()) {
1458 // Ensure we are not already sinking this operand.
1459 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1460 continue;
1461
// fabs/fneg fold into source modifiers when adjacent to their user.
1462 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1463 Ops.push_back(&Op);
1464 continue;
1465 }
1466
1467 // Check for zero-cost multiple use InsertElement/ExtractElement
1468 // instructions
1469 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1470 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
// Single-use vector sources are handled elsewhere; skip them.
1471 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1472 if (VecOpInst && VecOpInst->hasOneUse())
1473 continue;
1474
// NOTE(review): the CostKind/Index argument line of this call was dropped
// from this listing.
1475 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1477 OpInst->getOperand(0),
1478 OpInst->getOperand(1)) == 0) {
1479 Ops.push_back(&Op);
1480 continue;
1481 }
1482 }
1483 }
1484
1485 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1486
1487 unsigned EltSize = DL.getTypeSizeInBits(
1488 cast<VectorType>(Shuffle->getType())->getElementType());
1489
1490 // For i32 (or greater) shufflevectors, these will be lowered into a
1491 // series of insert / extract elements, which will be coalesced away.
1492 if (EltSize < 16 || !ST->has16BitInsts())
1493 continue;
1494
1495 int NumSubElts, SubIndex;
1496 if (Shuffle->changesLength()) {
// Widening identity shuffles are free to sink.
1497 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1498 Ops.push_back(&Op);
1499 continue;
1500 }
1501
// Even (register-aligned) sub-vector extract/insert masks are cheap.
1502 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1503 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1504 !(SubIndex & 0x1)) {
1505 Ops.push_back(&Op);
1506 continue;
1507 }
1508 }
1509
1510 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1511 Shuffle->isSingleSource()) {
1512 Ops.push_back(&Op);
1513 continue;
1514 }
1515 }
1516 }
1517
1518 return !Ops.empty();
1519}
1520
// Decide whether \p Callee may be inlined into \p Caller: the callee's
// relevant subtarget features must be a subset of the caller's, the FP mode
// registers must be compatible, and (unless always_inline/inlinehint) the
// post-inline basic-block count must stay within InlineMaxBB.
// NOTE(review): the signature head line was dropped from this listing.
1522 const Function *Callee) const {
1523 const TargetMachine &TM = getTLI()->getTargetMachine();
1524 const GCNSubtarget *CallerST
1525 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1526 const GCNSubtarget *CalleeST
1527 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1528
1529 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1530 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1531
// Ignore features on the allow-list, then require callee ⊆ caller.
1532 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1533 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1534 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1535 return false;
1536
1537 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1538 // no way to support merge for backend defined attributes.
1539 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1540 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1541 if (!CallerMode.isInlineCompatible(CalleeMode))
1542 return false;
1543
1544 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1545 Callee->hasFnAttribute(Attribute::InlineHint))
1546 return true;
1547
1548 // Hack to make compile times reasonable.
1549 if (InlineMaxBB) {
1550 // Single BB does not increase total BB amount.
1551 if (Callee->size() == 1)
1552 return true;
// -1: the callee's entry block merges into the call site's block.
1553 size_t BBSize = Caller->size() + Callee->size() - 1;
1554 return BBSize <= InlineMaxBB;
1555 }
1556
1557 return true;
1558}
1559
// Compute an inline-threshold bonus for call \p CB: if its arguments need
// more SGPRs/VGPRs than fit without spilling (26 SGPRs / 32 VGPRs), the
// excess would be passed through the stack, so inlining is made more
// attractive in proportion to that stack-traffic cost.
// NOTE(review): the signature head line was dropped from this listing.
1561 const SITargetLowering *TLI,
1562 const GCNTTIImpl *TTIImpl) {
1563 const int NrOfSGPRUntilSpill = 26;
1564 const int NrOfVGPRUntilSpill = 32;
1565
1566 const DataLayout &DL = TTIImpl->getDataLayout();
1567
1568 unsigned adjustThreshold = 0;
1569 int SGPRsInUse = 0;
1570 int VGPRsInUse = 0;
1571 for (const Use &A : CB->args()) {
1572 SmallVector<EVT, 4> ValueVTs;
1573 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1574 for (auto ArgVT : ValueVTs) {
1575 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1576 CB->getContext(), CB->getCallingConv(), ArgVT);
// NOTE(review): the condition selecting SGPR vs. VGPR accounting was
// dropped from this listing.
1578 SGPRsInUse += CCRegNum;
1579 else
1580 VGPRsInUse += CCRegNum;
1581 }
1582 }
1583
1584 // The cost of passing function arguments through the stack:
1585 // 1 instruction to put a function argument on the stack in the caller.
1586 // 1 instruction to take a function argument from the stack in callee.
1587 // 1 instruction is explicitly take care of data dependencies in callee
1588 // function.
// NOTE(review): the trailing argument lines of the two getMemoryOpCost
// calls were dropped from this listing.
1589 InstructionCost ArgStackCost(1);
1590 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1591 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1593 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1594 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1596
1597 // The penalty cost is computed relative to the cost of instructions and does
1598 // not model any storage costs.
1599 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1600 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1601 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1602 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1603 return adjustThreshold;
1604}
1605
// Sum the sizes (bytes) of distinct static allocas passed to \p CB via
// flat/private pointer arguments — the scratch memory that would remain live
// if the call were not inlined.
1606static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1607 const DataLayout &DL) {
1608 // If we have a pointer to a private array passed into a function
1609 // it will not be optimized out, leaving scratch usage.
1610 // This function calculates the total size in bytes of the memory that would
1611 // end in scratch if the call was not inlined.
1612 unsigned AllocaSize = 0;
// NOTE(review): the declaration of the AIVisited set was dropped from this
// listing; it de-duplicates allocas reachable through multiple arguments.
1614 for (Value *PtrArg : CB->args()) {
1615 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1616 if (!Ty)
1617 continue;
1618
1619 unsigned AddrSpace = Ty->getAddressSpace();
1620 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1621 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1622 continue;
1623
// NOTE(review): the line resolving the argument to its underlying
// AllocaInst (AI) was dropped from this listing.
1625 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1626 continue;
1627
1628 if (auto Size = AI->getAllocationSize(DL))
1629 AllocaSize += Size->getFixedValue();
1630 }
1631 return AllocaSize;
1632}
1633
1638
1640 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1641
1642 // Private object passed as arguments may end up in scratch usage if the call
1643 // is not inlined. Increase the inline threshold to promote inlining.
1644 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1645 if (AllocaSize > 0)
1646 Threshold += ArgAllocaCost;
1647 return Threshold;
1648}
1649
// Per-alloca inliner cost that cancels the ArgAllocaCost threshold bonus:
// each private object passed to \p CB gets a share of the bonus proportional
// to its size, so the bonus nets to zero unless SROA removes the alloca.
// NOTE(review): the signature head line was dropped from this listing.
1651 const AllocaInst *AI) const {
1652
1653 // Below the cutoff, assume that the private memory objects would be
1654 // optimized
1655 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1656 if (AllocaSize <= ArgAllocaCutoff)
1657 return 0;
1658
1659 // Above the cutoff, we give a cost to each private memory object
1660 // depending its size. If the array can be optimized by SROA this cost is not
1661 // added to the total-cost in the inliner cost analysis.
1662 //
1663 // We choose the total cost of the alloca such that their sum cancels the
1664 // bonus given in the threshold (ArgAllocaCost).
1665 //
1666 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1667 //
1668 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1669 // the single-bb bonus and the vector-bonus.
1670 //
1671 // We compensate the first two multipliers, by repeating logic from the
1672 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1673 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1674 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1675
// Mirror the inliner's single-basic-block bonus (+50%).
1676 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1677 return BB.getTerminator()->getNumSuccessors() > 1;
1678 });
1679 if (SingleBB) {
1680 Threshold += Threshold / 2;
1681 }
1682
1683 auto ArgAllocaSize = AI->getAllocationSize(DL);
1684 if (!ArgAllocaSize)
1685 return 0;
1686
1687 // Attribute the bonus proportionally to the alloca size
1688 unsigned AllocaThresholdBonus =
1689 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1690
1691 return AllocaThresholdBonus;
1692}
1693
1696 OptimizationRemarkEmitter *ORE) const {
1697 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1698}
1699
1701 TTI::PeelingPreferences &PP) const {
1702 CommonTTI.getPeelingPreferences(L, SE, PP);
1703}
1704
1705int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
1706 return getQuarterRateInstrCost(CostKind);
1707}
1708
1709int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1710 return ST->hasFullRate64Ops()
1711 ? getFullRateInstrCost()
1712 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1713 : getQuarterRateInstrCost(CostKind);
1714}
1715
1716std::pair<InstructionCost, MVT>
1717GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1718 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1719 auto Size = DL.getTypeSizeInBits(Ty);
1720 // Maximum load or store can handle 8 dwords for scalar and 4 for
1721 // vector ALU. Let's assume anything above 8 dwords is expensive
1722 // even if legal.
1723 if (Size <= 256)
1724 return Cost;
1725
1726 Cost.first += (Size + 255) / 256;
1727 return Cost;
1728}
1729
1731 return ST->hasPrefetch() ? 128 : 0;
1732}
1733
1736}
1737
// Append (attribute-name, value) launch-bound pairs for kernel \p F:
// per-dimension max workgroup counts, the flat workgroup size range, and
// the waves-per-EU range, all read from the subtarget's view of F.
// NOTE(review): the signature head line was dropped from this listing.
1739 const Function &F,
1740 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1741 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1742 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1743 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1744 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1745 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1746 ST->getFlatWorkGroupSizes(F);
// first = minimum, second = maximum of the flat workgroup size range.
1747 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1748 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1749 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1750 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1751 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1752}
1753
1756 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1757 return KnownIEEEMode::On; // Only mode on gfx1170+
1758
1759 const Function *F = I.getFunction();
1760 if (!F)
1762
1763 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1764 if (IEEEAttr.isValid())
1766
1767 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1769}
1770
1772 Align Alignment,
1773 unsigned AddressSpace,
1775 TTI::OperandValueInfo OpInfo,
1776 const Instruction *I) const {
1777 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1778 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1779 VecTy->getElementType()->isIntegerTy(8)) {
1780 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1782 }
1783 }
1784 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1785 OpInfo, I);
1786}
1787
// Number of register parts for type \p Tp. i8 vectors are special-cased:
// they are treated as packed four-per-dword.
// NOTE(review): the signature head line was dropped from this listing.
// NOTE(review): divideCeil(ElementCount - 1, 4) gives 1 for both 4 and 5
// elements — confirm the "- 1" matches the packing used by getMemoryOpCost.
1789 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1790 if (VecTy->getElementType()->isIntegerTy(8)) {
1791 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1792 return divideCeil(ElementCount - 1, 4);
1793 }
1794 }
// Non-i8 types use the generic part count.
1795 return BaseT::getNumberOfParts(Tp);
1796}
1797
1800 switch (Intrinsic->getIntrinsicID()) {
1801 case Intrinsic::amdgcn_wave_shuffle:
1803 default:
1804 break;
1805 }
1806 }
1807
1808 if (isAlwaysUniform(V))
1810
1811 if (isSourceOfDivergence(V))
1813
1815}
1816
1818 StackOffset BaseOffset,
1819 bool HasBaseReg, int64_t Scale,
1820 unsigned AddrSpace) const {
1821 if (HasBaseReg && Scale != 0) {
1822 // gfx1250+ can fold base+scale*index when scale matches the memory access
1823 // size (scale_offset bit). Supported for flat/global/constant/scratch
1824 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1825 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1827 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1828 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
1829 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1830 if (TypeSize::isKnownLE(StoreSize, TypeSize::getFixed(16)) &&
1831 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1832 return 0;
1833 }
1834 return 1;
1835 }
1836 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1837 AddrSpace);
1838}
1839
// Compare two LSR solutions: \p A is "less" (better) than \p B when its
// effective per-iteration instruction count — and then the usual tie-break
// chain — is lexicographically smaller.
// NOTE(review): the signature head line was dropped from this listing.
1841 const TTI::LSRCost &B) const {
1842 // Favor lower per-iteration work over preheader/setup costs.
1843 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1844 // effective instruction count (base+scale*index requires a separate ADD).
1845 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1846 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1847
// Lexicographic comparison; NumRegs is deliberately last (least weight).
1848 return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
1849 A.SetupCost, A.ImmCost, A.NumRegs) <
1850 std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
1851 B.SetupCost, B.ImmCost, B.NumRegs);
1852}
1853
1855 // isLSRCostLess de-prioritizes register count; keep consistent.
1856 return false;
1857}
1858
1860 // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
1861 return true;
1862}
1863
1865 const SmallBitVector &UniformArgs) const {
1867 switch (Intrinsic->getIntrinsicID()) {
1868 case Intrinsic::amdgcn_wave_shuffle:
1869 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1870 // is uniform.
1871 return UniformArgs[0] || UniformArgs[1];
1872 default:
1873 llvm_unreachable("unexpected intrinsic in isUniform");
1874 }
1875}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:572
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool approxFunc() const
Definition FMF.h:73
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const override
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool shouldDropLSRSolutionIfLessProfitable() const override
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
ValueUniformity getValueUniformity(const Value *V) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
bool isNumRegsMajorCostOfLSR() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1080
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:312
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
ValueUniformity
Enum describing how values behave with respect to uniformity and divergence, to answer the question: ...
Definition Uniformity.h:18
@ AlwaysUniform
The result value is always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result value can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result value is uniform if and only if all operands are uniform.
Definition Uniformity.h:20
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...