LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUTargetMachine.h"
25#include "llvm/IR/Function.h"
26#include "llvm/IR/IRBuilder.h"
27#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include <optional>
32
33using namespace llvm;
34
35#define DEBUG_TYPE "AMDGPUtti"
36
38 "amdgpu-unroll-threshold-private",
39 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
40 cl::init(2700), cl::Hidden);
41
43 "amdgpu-unroll-threshold-local",
44 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
45 cl::init(1000), cl::Hidden);
46
48 "amdgpu-unroll-threshold-if",
49 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
50 cl::init(200), cl::Hidden);
51
53 "amdgpu-unroll-runtime-local",
54 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
55 cl::init(true), cl::Hidden);
56
58 "amdgpu-unroll-max-block-to-analyze",
59 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
60 cl::init(32), cl::Hidden);
61
// Inline-cost credit associated with alloca arguments (see cl::desc); used
// together with ArgAllocaCutoff below to bias inlining of callees that take
// pointers to scratch allocas.
62static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
63 cl::Hidden, cl::init(4000),
64 cl::desc("Cost of alloca argument"));
65
66// If the amount of scratch memory to eliminate exceeds our ability to allocate
67// it into registers we gain nothing by aggressively inlining functions for that
68// heuristic.
70 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
71 cl::init(256),
72 cl::desc("Maximum alloca size to use for inline cost"));
73
74// Inliner constraint to achieve reasonable compilation time.
76 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
77 cl::desc("Maximum number of BBs allowed in a function after inlining"
78 " (compile time constraint)"));
79
80// This default unroll factor is based on microbenchmarks on gfx1030.
// NOTE(review): the cl::opt declaration line itself is elided from this
// excerpt. Also note the adjacent string literals below concatenate to
// "...memmove, ormemset..." -- a space appears to be missing after "or";
// confirm against upstream before fixing.
82 "amdgpu-memcpy-loop-unroll",
83 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
84 "operations when lowering statically-sized memcpy, memmove, or"
85 "memset as a loop"),
86 cl::init(16), cl::Hidden);
87
// Returns true if \p Cond is (transitively, up to a recursion depth of 10)
// computed from a PHI node that belongs to loop \p L itself rather than to
// one of its sub-loops. getUnrollingPreferences() uses this to spot "if"
// conditions that unrolling may be able to fold away.
// NOTE(review): the upstream line defining 'I' (presumably a
// dyn_cast<Instruction> of Cond) is elided from this excerpt.
88static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
89 unsigned Depth = 0) {
91 if (!I)
92 return false;
93
94 for (const Value *V : I->operand_values()) {
// Note: this guard tests I (the whole condition), not the operand V, so it
// is invariant across loop iterations -- preserved as-is from upstream.
95 if (!L->contains(I))
96 continue;
97 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
// A PHI contained in L but in none of L's sub-loops belongs to L itself.
98 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
99 return SubLoop->contains(PHI); }))
100 return true;
// Recurse through non-PHI operands; depth bound caps the search cost.
101 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
102 return true;
103 }
104 return false;
105}
106
// AMDGPUTTIImpl constructor: caches the target triple, the GCN subtarget
// selected for \p F, and its TargetLowering.
// NOTE(review): the constructor signature line is elided from this excerpt.
108 : BaseT(TM, F.getDataLayout()),
109 TargetTriple(TM->getTargetTriple()),
110 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
111 TLI(ST->getTargetLowering()) {}
112
// AMDGPU unrolling preferences. Boosts the unroll threshold for loops whose
// GEPs address private (scratch) or LDS/region memory using loop-defined
// indices, and for loops containing "if"s whose condition depends on a loop
// PHI. NOTE(review): this excerpt elides several upstream lines (the
// signature and a few statements), so some definitions referenced below
// (MetaThresholdValue, GEP, the AS conditions) are not visible here.
115 OptimizationRemarkEmitter *ORE) const {
116 const Function &F = *L->getHeader()->getParent();
117 UP.Threshold =
118 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
119 UP.MaxCount = std::numeric_limits<unsigned>::max();
120 UP.Partial = true;
121
122 // Conditional branch in a loop back edge needs 3 additional exec
123 // manipulations in average.
124 UP.BEInsns += 3;
125
126 // We want to run unroll even for the loops which have been vectorized.
127 UP.UnrollVectorizedLoop = true;
128
129 // TODO: Do we want runtime unrolling?
130
131 // Maximum alloca size than can fit registers. Reserve 16 registers.
132 const unsigned MaxAlloca = (256 - 16) * 4;
133 unsigned ThresholdPrivate = UnrollThresholdPrivate;
134 unsigned ThresholdLocal = UnrollThresholdLocal;
135
136 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
137 // provided threshold value as the default for Threshold
138 if (MDNode *LoopUnrollThreshold =
139 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
140 if (LoopUnrollThreshold->getNumOperands() == 2) {
// NOTE(review): the line defining MetaThresholdValue (extracting operand 1
// as a ConstantInt, presumably) is elided here.
142 LoopUnrollThreshold->getOperand(1));
143 if (MetaThresholdValue) {
144 // We will also use the supplied value for PartialThreshold for now.
145 // We may introduce additional metadata if it becomes necessary in the
146 // future.
147 UP.Threshold = MetaThresholdValue->getSExtValue();
// Metadata-supplied threshold also caps the private/local boosts.
149 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
150 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
151 }
152 }
153 }
154
155 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
156 for (const BasicBlock *BB : L->getBlocks()) {
157 const DataLayout &DL = BB->getDataLayout();
158 unsigned LocalGEPsSeen = 0;
159
160 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
161 return SubLoop->contains(BB); }))
162 continue; // Block belongs to an inner loop.
163
164 for (const Instruction &I : *BB) {
165 // Unroll a loop which contains an "if" statement whose condition
166 // defined by a PHI belonging to the loop. This may help to eliminate
167 // if region and potentially even PHI itself, saving on both divergence
168 // and registers used for the PHI.
169 // Add a small bonus for each of such "if" statements.
170 if (const CondBrInst *Br = dyn_cast<CondBrInst>(&I)) {
171 if (UP.Threshold < MaxBoost) {
172 BasicBlock *Succ0 = Br->getSuccessor(0);
173 BasicBlock *Succ1 = Br->getSuccessor(1);
// Skip loop-exiting branches: the bonus targets ifs that unrolling folds.
174 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
175 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
176 continue;
177 if (dependsOnLocalPhi(L, Br->getCondition())) {
// NOTE(review): the threshold-increment statement (presumably adding
// UnrollThresholdIf) is elided here.
179 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
180 << " for loop:\n"
181 << *L << " due to " << *Br << '\n');
182 if (UP.Threshold >= MaxBoost)
183 return;
184 }
185 }
186 continue;
187 }
188
// NOTE(review): the line defining GEP from &I is elided here.
190 if (!GEP)
191 continue;
192
193 unsigned AS = GEP->getAddressSpace();
194 unsigned Threshold = 0;
// NOTE(review): the address-space conditions that select between the
// private and local thresholds are elided here.
196 Threshold = ThresholdPrivate;
198 Threshold = ThresholdLocal;
199 else
200 continue;
201
202 if (UP.Threshold >= Threshold)
203 continue;
204
205 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
206 const Value *Ptr = GEP->getPointerOperand();
207 const AllocaInst *Alloca =
// NOTE(review): the initializer of Alloca is elided here.
209 if (!Alloca || !Alloca->isStaticAlloca())
210 continue;
// Only boost for allocas small enough to be promoted into registers.
211 auto AllocaSize = Alloca->getAllocationSize(DL);
212 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
213 continue;
214 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
// NOTE(review): the second operand of this condition is elided here.
216 LocalGEPsSeen++;
217 // Inhibit unroll for local memory if we have seen addressing not to
218 // a variable, most likely we will be unable to combine it.
219 // Do not unroll too deep inner loops for local memory to give a chance
220 // to unroll an outer loop for a more important reason.
221 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
222 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
223 !isa<Argument>(GEP->getPointerOperand())))
224 continue;
225 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
226 << *L << " due to LDS use.\n");
// NOTE(review): the runtime-unroll enable (presumably setting UP.Runtime
// from UnrollRuntimeLocal) is elided here.
228 }
229
230 // Check if GEP depends on a value defined by this loop itself.
231 bool HasLoopDef = false;
232 for (const Value *Op : GEP->operands()) {
233 const Instruction *Inst = dyn_cast<Instruction>(Op);
234 if (!Inst || L->isLoopInvariant(Op))
235 continue;
236
237 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
238 return SubLoop->contains(Inst); }))
239 continue;
240 HasLoopDef = true;
241 break;
242 }
243 if (!HasLoopDef)
244 continue;
245
246 // We want to do whatever we can to limit the number of alloca
247 // instructions that make it through to the code generator. allocas
248 // require us to use indirect addressing, which is slow and prone to
249 // compiler bugs. If this loop does an address calculation on an
250 // alloca ptr, then we want to use a higher than normal loop unroll
251 // threshold. This will give SROA a better chance to eliminate these
252 // allocas.
253 //
254 // We also want to have more unrolling for local memory to let ds
255 // instructions with different offsets combine.
256 //
257 // Don't use the maximum allowed value here as it will make some
258 // programs way too big.
259 UP.Threshold = Threshold;
260 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
261 << " for loop:\n"
262 << *L << " due to " << *GEP << '\n');
263 if (UP.Threshold >= MaxBoost)
264 return;
265 }
266
267 // If we got a GEP in a small BB from inner loop then increase max trip
268 // count to analyze for better estimation cost in unroll
269 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
// NOTE(review): the statement guarded by this condition is elided here.
271 }
272 // If a user provided an explicit unroll pragma (with or without count),
273 // override expensive trip count checks
274 UnrollPragmaInfo PInfo(L);
275 if (PInfo.PragmaEnableUnroll || PInfo.PragmaCount > 0)
276 UP.AllowExpensiveTripCount = true;
277}
278
283
287
// Subtarget features that are ignored when deciding whether caller and
// callee feature sets are compatible for inlining: either pure codegen
// tuning knobs or properties that cannot differ between functions linked
// into one program.
288const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
289 // Codegen control options which don't matter.
290 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
291 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
292 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
293
294 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
295
296 // Property of the kernel/environment which can't actually differ.
297 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
298 AMDGPU::FeatureTrapHandler,
299
300 // The default assumption needs to be ecc is enabled, but no directly
301 // exposed operations depend on it, so it can be safely inlined.
302 AMDGPU::FeatureSRAMECC,
303
304 // Perf-tuning features
305 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
306
// GCNTTIImpl constructor: caches subtarget/lowering handles and records the
// function's FP denormal modes, which the cost model below consults (denorm
// handling changes which FMA/MAD forms are usable).
// NOTE(review): the constructor signature line and the definition of 'Mode'
// (presumably the function's FP mode defaults) are elided from this excerpt.
308 : BaseT(TM, F.getDataLayout()),
309 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
310 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
311 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
313 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
314 HasFP64FP16Denormals =
315 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
316}
317
319 return !F || !ST->isSingleLaneExecution(*F);
320}
321
322unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
323 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
324 // registers. See getRegisterClassForType for the implementation.
325 // In this case vector registers are not vector in terms of
326 // VGPRs, but those which can hold multiple values.
327
328 // This is really the number of registers to fill when vectorizing /
329 // interleaving loops, so we lie to avoid trying to use all registers.
330 return 4;
331}
332
// Width in bits of a TTI register of kind K.
// NOTE(review): the function signature and the three 'case' labels are
// elided from this excerpt; judging by the return values, the arms are
// scalar, fixed-width vector, and scalable vector in that order -- confirm
// against upstream.
335 switch (K) {
// 32-bit scalar registers.
337 return TypeSize::getFixed(32);
// Fixed vectors: 64 bits when packed-FP32 ops exist, else 32.
339 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
// No scalable-vector support on this target.
341 return TypeSize::getScalable(0);
342 }
343 llvm_unreachable("Unsupported register kind");
344}
345
347 return 32;
348}
349
350unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
351 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
352 return 32 * 4 / ElemWidth;
353 // For a given width return the max 0number of elements that can be combined
354 // into a wider bit value:
355 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
356 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
357 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
358 : 1;
359}
360
361unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
362 unsigned ChainSizeInBytes,
363 VectorType *VecTy) const {
364 unsigned VecRegBitWidth = VF * LoadSize;
365 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
366 // TODO: Support element-size less than 32bit?
367 return 128 / LoadSize;
368
369 return VF;
370}
371
372unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
373 unsigned ChainSizeInBytes,
374 VectorType *VecTy) const {
375 unsigned VecRegBitWidth = VF * StoreSize;
376 if (VecRegBitWidth > 128)
377 return 128 / StoreSize;
378
379 return VF;
380}
381
// Maximum vectorized access width (in bits) for the given address space.
// NOTE(review): two of the address-space comparisons in the first 'if' are
// elided from this excerpt.
382unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
// Global/constant and buffer-style address spaces support 512-bit accesses.
383 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
384 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
386 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
387 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
389 return 512;
390 }
391
// Scratch is limited by the subtarget's maximum private element size (bytes).
392 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
393 return 8 * ST->getMaxPrivateElementSize();
394
395 // Common to flat, global, local and region. Assume for unknown addrspace.
396 return 128;
397}
398
399bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
400 Align Alignment,
401 unsigned AddrSpace) const {
402 // We allow vectorization of flat stores, even though we may need to decompose
403 // them later if they may access private memory. We don't have enough context
404 // here, and legalization can handle it.
405 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
406 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
407 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
408 }
409 return true;
410}
411
412bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
413 Align Alignment,
414 unsigned AddrSpace) const {
415 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
416}
417
418bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
419 Align Alignment,
420 unsigned AddrSpace) const {
421 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
422}
423
427
// Element type to use for the body of a lowered memcpy/memmove/memset loop.
// NOTE(review): the function signature and the constant-length check that
// presumably returns the wider (MemcpyLoopUnroll * 4 x i32) type are elided
// from this excerpt.
429 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
430 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
431 std::optional<uint32_t> AtomicElementSize) const {
432
// Atomic element-wise copies must use exactly the requested element width.
433 if (AtomicElementSize)
434 return Type::getIntNTy(Context, *AtomicElementSize * 8);
435
436 // 16-byte accesses achieve the highest copy throughput.
437 // If the operation has a fixed known length that is large enough, it is
438 // worthwhile to return an even wider type and let legalization lower it into
439 // multiple accesses, effectively unrolling the memcpy loop.
440 // We also rely on legalization to decompose into smaller accesses for
441 // subtargets and address spaces where it is necessary.
442 //
443 // Don't unroll if Length is not a constant, since unrolling leads to worse
444 // performance for length values that are smaller or slightly larger than the
445 // total size of the type returned here. Mitigating that would require a more
446 // complex lowering for variable-length memcpy and memmove.
447 unsigned I32EltsInVector = 4;
450 MemcpyLoopUnroll * I32EltsInVector);
451
// Variable-length case: plain <4 x i32>, i.e. a 16-byte access per trip.
452 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
453}
454
// Types used to copy the residue (tail) of a lowered memcpy: greedily cover
// the remaining bytes with the widest naturally-sized types -- 16-byte
// vectors, then i64, i32, i16, and finally i8.
// NOTE(review): the function signature line and the forwarding call to the
// BaseT implementation for the atomic case are elided from this excerpt.
456 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
457 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
458 Align SrcAlign, Align DestAlign,
459 std::optional<uint32_t> AtomicCpySize) const {
460
// Atomic copies defer to the generic implementation.
461 if (AtomicCpySize)
463 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
464 DestAlign, AtomicCpySize);
465
466 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
467 while (RemainingBytes >= 16) {
468 OpsOut.push_back(I32x4Ty);
469 RemainingBytes -= 16;
470 }
471
472 Type *I64Ty = Type::getInt64Ty(Context);
473 while (RemainingBytes >= 8) {
474 OpsOut.push_back(I64Ty);
475 RemainingBytes -= 8;
476 }
477
478 Type *I32Ty = Type::getInt32Ty(Context);
479 while (RemainingBytes >= 4) {
480 OpsOut.push_back(I32Ty);
481 RemainingBytes -= 4;
482 }
483
484 Type *I16Ty = Type::getInt16Ty(Context);
485 while (RemainingBytes >= 2) {
486 OpsOut.push_back(I16Ty);
487 RemainingBytes -= 2;
488 }
489
490 Type *I8Ty = Type::getInt8Ty(Context);
491 while (RemainingBytes) {
492 OpsOut.push_back(I8Ty);
493 --RemainingBytes;
494 }
495}
496
// Interleave factor for the loop vectorizer: 1 for scalar (unvectorized)
// loops, 8 otherwise.
// NOTE(review): the function signature line (taking the vectorization
// factor VF) is elided from this excerpt.
498 // Disable unrolling if the loop is not vectorized.
499 // TODO: Enable this again.
500 if (VF.isScalar())
501 return 1;
502
503 return 8;
504}
505
// Describes target memory intrinsics to the optimizer by filling \p Info.
// Only ds_ordered_add / ds_ordered_swap are handled; returns false for
// anything else. NOTE(review): the function signature line is elided from
// this excerpt.
507 MemIntrinsicInfo &Info) const {
508 switch (Inst->getIntrinsicID()) {
509 case Intrinsic::amdgcn_ds_ordered_add:
510 case Intrinsic::amdgcn_ds_ordered_swap: {
// Ordering (arg 2) and volatile (arg 4) must be compile-time constants.
511 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
512 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
513 if (!Ordering || !Volatile)
514 return false; // Invalid.
515
// Reject ordering encodings outside the AtomicOrdering range.
516 unsigned OrderingVal = Ordering->getZExtValue();
517 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
518 return false;
519
// These intrinsics both read and write LDS memory through arg 0.
520 Info.PtrVal = Inst->getArgOperand(0);
521 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
522 Info.ReadMem = true;
523 Info.WriteMem = true;
524 Info.IsVolatile = !Volatile->isZero();
525 return true;
526 }
527 default:
528 return false;
529 }
530}
531
// Cost model for scalar/vector arithmetic on GCN. Legalizes the type, then
// prices the operation per legalized element at full/half/quarter/64-bit
// instruction rates. NOTE(review): this excerpt elides the function
// signature (including the OperandValueInfo parameters Op1Info/Op2Info
// referenced at the end) and several 'return' statements in the FMUL
// fusion checks (presumably returning a free cost).
533 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
535 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
536
537 // Legalize the type.
538 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
539 int ISD = TLI->InstructionOpcodeToISD(Opcode);
540
541 // Because we don't have any legal vector operations, but the legal types, we
542 // need to account for split vectors.
543 unsigned NElts = LT.second.isVector() ?
544 LT.second.getVectorNumElements() : 1;
545
546 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
547
548 switch (ISD) {
549 case ISD::SHL:
550 case ISD::SRL:
551 case ISD::SRA:
552 if (SLT == MVT::i64)
553 return get64BitInstrCost(CostKind) * LT.first * NElts;
554
// Packed 16-bit ops process two elements per instruction.
555 if (ST->has16BitInsts() && SLT == MVT::i16)
556 NElts = (NElts + 1) / 2;
557
558 // i32
559 return getFullRateInstrCost() * LT.first * NElts;
560 case ISD::ADD:
561 case ISD::SUB:
562 case ISD::AND:
563 case ISD::OR:
564 case ISD::XOR:
565 if (SLT == MVT::i64) {
566 // and, or and xor are typically split into 2 VALU instructions.
567 return 2 * getFullRateInstrCost() * LT.first * NElts;
568 }
569
570 if (ST->has16BitInsts() && SLT == MVT::i16)
571 NElts = (NElts + 1) / 2;
572
573 return LT.first * NElts * getFullRateInstrCost();
574 case ISD::MUL: {
575 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
576 if (SLT == MVT::i64) {
// 64-bit multiply expands to 4 quarter-rate + 4 full-rate instructions.
577 const int FullRateCost = getFullRateInstrCost();
578 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
579 }
580
581 if (ST->has16BitInsts() && SLT == MVT::i16)
582 NElts = (NElts + 1) / 2;
583
584 // i32
585 return QuarterRateCost * NElts * LT.first;
586 }
587 case ISD::FMUL:
588 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
589 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
590 // fused operation.
591 if (CxtI && CxtI->hasOneUse())
592 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
593 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
594 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
// NOTE(review): the 'return' taken under each of the next three fusion
// conditions is elided from this excerpt.
595 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
597 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
599
600 // Estimate all types may be fused with contract/unsafe flags
601 const TargetOptions &Options = TLI->getTargetMachine().Options;
602 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
603 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
605 }
606 }
607 [[fallthrough]];
608 case ISD::FADD:
609 case ISD::FSUB:
// Packed f32/bf16/f16 forms halve the effective element count.
610 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
611 NElts = (NElts + 1) / 2;
612 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
613 NElts = (NElts + 1) / 2;
614 if (SLT == MVT::f64)
615 return LT.first * NElts * get64BitInstrCost(CostKind);
616
617 if (ST->has16BitInsts() && SLT == MVT::f16)
618 NElts = (NElts + 1) / 2;
619
620 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
621 return LT.first * NElts * getFullRateInstrCost();
622 break;
623 case ISD::FDIV:
624 case ISD::FREM:
625 // FIXME: frem should be handled separately. The fdiv in it is most of it,
626 // but the current lowering is also not entirely correct.
627 if (SLT == MVT::f64) {
628 int Cost = 7 * get64BitInstrCost(CostKind) +
629 getQuarterRateInstrCost(CostKind) +
630 3 * getHalfRateInstrCost(CostKind);
631 // Add cost of workaround.
632 if (!ST->hasUsableDivScaleConditionOutput())
633 Cost += 3 * getFullRateInstrCost();
634
635 return LT.first * Cost * NElts;
636 }
637
// Reciprocal (1.0 / x) is cheaper than a general divide.
638 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
639 // TODO: This is more complicated, unsafe flags etc.
640 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
641 (SLT == MVT::f16 && ST->has16BitInsts())) {
642 return LT.first * getQuarterRateInstrCost(CostKind) * NElts;
643 }
644 }
645
646 if (SLT == MVT::f16 && ST->has16BitInsts()) {
647 // 2 x v_cvt_f32_f16
648 // f32 rcp
649 // f32 fmul
650 // v_cvt_f16_f32
651 // f16 div_fixup
652 int Cost =
653 4 * getFullRateInstrCost() + 2 * getQuarterRateInstrCost(CostKind);
654 return LT.first * Cost * NElts;
655 }
656
657 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
658 // Fast unsafe fdiv lowering:
659 // f32 rcp
660 // f32 fmul
661 int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
662 return LT.first * Cost * NElts;
663 }
664
665 if (SLT == MVT::f32 || SLT == MVT::f16) {
666 // 4 more v_cvt_* insts without f16 insts support
667 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
668 1 * getQuarterRateInstrCost(CostKind);
669
670 if (!HasFP32Denormals) {
671 // FP mode switches.
672 Cost += 2 * getFullRateInstrCost();
673 }
674
675 return LT.first * NElts * Cost;
676 }
677 break;
678 case ISD::FNEG:
679 // Use the backend' estimation. If fneg is not free each element will cost
680 // one additional instruction.
681 return TLI->isFNegFree(SLT) ? 0 : NElts;
682 default:
683 break;
684 }
685
686 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
687 Args, CxtI);
688}
689
690// Return true if there's a potential benefit from using v2f16/v2i16
691// instructions for an intrinsic, even if it requires nontrivial legalization.
// NOTE(review): the function signature line (a static predicate taking an
// Intrinsic::ID) is elided from this excerpt.
693 switch (ID) {
694 case Intrinsic::fma:
695 case Intrinsic::fmuladd:
696 case Intrinsic::copysign:
697 case Intrinsic::minimumnum:
698 case Intrinsic::maximumnum:
699 case Intrinsic::canonicalize:
700 // There's a small benefit to using vector ops in the legalized code.
701 case Intrinsic::round:
702 case Intrinsic::uadd_sat:
703 case Intrinsic::usub_sat:
704 case Intrinsic::sadd_sat:
705 case Intrinsic::ssub_sat:
706 case Intrinsic::abs:
707 return true;
708 default:
709 return false;
710 }
711}
712
// Cost model for intrinsic calls. Free cases (source modifiers and argument
// registers) are handled first; exp/exp2/exp10 get a detailed model; the
// remainder are priced per legalized element, halved for types with packed
// forms. NOTE(review): this excerpt elides the function signature, the
// guard that falls back to the base implementation for intrinsics without
// packed-vector benefit, the declaration of 'Cost' in the f32 exp path,
// and the condition preceding 'NumOps = 1' in the minimumnum/maximumnum
// case.
716 switch (ICA.getID()) {
717 case Intrinsic::fabs:
718 // Free source modifier in the common case.
719 return 0;
720 case Intrinsic::amdgcn_workitem_id_x:
721 case Intrinsic::amdgcn_workitem_id_y:
722 case Intrinsic::amdgcn_workitem_id_z:
723 // TODO: If hasPackedTID, or if the calling context is not an entry point
724 // there may be a bit instruction.
725 return 0;
726 case Intrinsic::amdgcn_workgroup_id_x:
727 case Intrinsic::amdgcn_workgroup_id_y:
728 case Intrinsic::amdgcn_workgroup_id_z:
729 case Intrinsic::amdgcn_lds_kernel_id:
730 case Intrinsic::amdgcn_dispatch_ptr:
731 case Intrinsic::amdgcn_dispatch_id:
732 case Intrinsic::amdgcn_implicitarg_ptr:
733 case Intrinsic::amdgcn_queue_ptr:
734 // Read from an argument register.
735 return 0;
736 default:
737 break;
738 }
739
740 Type *RetTy = ICA.getReturnType();
741
742 Intrinsic::ID IID = ICA.getID();
743 switch (IID) {
744 case Intrinsic::exp:
745 case Intrinsic::exp2:
746 case Intrinsic::exp10: {
747 // Legalize the type.
748 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
749 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
750 unsigned NElts =
751 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
752
753 if (SLT == MVT::f64) {
754 unsigned NumOps = 20;
755 if (IID == Intrinsic::exp)
756 ++NumOps;
757 else if (IID == Intrinsic::exp10)
758 NumOps += 3;
759
760 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
761 }
762
763 if (SLT == MVT::f32) {
764 unsigned NumFullRateOps = 0;
765 // v_exp_f32 (quarter rate).
766 unsigned NumQuarterRateOps = 1;
767
768 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
769 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
770 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
771 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
772 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
773 } else {
774 if (IID == Intrinsic::exp) {
775 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
776 NumFullRateOps = 1;
777 } else if (IID == Intrinsic::exp10) {
778 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
779 NumFullRateOps = 3;
780 NumQuarterRateOps = 2;
781 }
782 // Denorm scaling adds setcc + select + fadd + select + fmul.
783 if (HasFP32Denormals)
784 NumFullRateOps += 5;
785 }
786
// NOTE(review): the declaration initializing 'Cost' is elided here.
788 NumFullRateOps * getFullRateInstrCost() +
789 NumQuarterRateOps * getQuarterRateInstrCost(CostKind);
790 return LT.first * NElts * Cost;
791 }
792
793 break;
794 }
795 default:
796 break;
797 }
798
// NOTE(review): the guard delegating to the base-class cost for intrinsics
// without packed-vector benefit is elided here.
801
802 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
803 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
804 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
805
// Types with packed forms process two elements per instruction.
806 if ((ST->hasVOP3PInsts() &&
807 (SLT == MVT::f16 || SLT == MVT::i16 ||
808 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
809 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
810 NElts = (NElts + 1) / 2;
811
812 // TODO: Get more refined intrinsic costs?
813 unsigned InstRate = getQuarterRateInstrCost(CostKind);
814
815 switch (ICA.getID()) {
816 case Intrinsic::fma:
817 case Intrinsic::fmuladd:
818 if (SLT == MVT::f64) {
819 InstRate = get64BitInstrCost(CostKind);
820 break;
821 }
822
823 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
824 InstRate = getFullRateInstrCost();
825 else {
826 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
827 : getQuarterRateInstrCost(CostKind);
828 }
829 break;
830 case Intrinsic::copysign:
831 return NElts * getFullRateInstrCost();
832 case Intrinsic::minimumnum:
833 case Intrinsic::maximumnum: {
834 // Instruction + 2 canonicalizes. For cases that need type promotion, we the
835 // promotion takes the place of the canonicalize.
836 unsigned NumOps = 3;
837 if (const IntrinsicInst *II = ICA.getInst()) {
838 // Directly legal with ieee=0
839 // TODO: Not directly legal with strictfp
// NOTE(review): the condition guarding this assignment is elided here.
841 NumOps = 1;
842 }
843
844 unsigned BaseRate =
845 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
846 InstRate = BaseRate * NumOps;
847 break;
848 }
849 case Intrinsic::canonicalize: {
850 InstRate =
851 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
852 break;
853 }
854 case Intrinsic::uadd_sat:
855 case Intrinsic::usub_sat:
856 case Intrinsic::sadd_sat:
857 case Intrinsic::ssub_sat: {
858 if (SLT == MVT::i16 || SLT == MVT::i32)
859 InstRate = getFullRateInstrCost();
860
// Legal packed saturating types count as a single operation.
861 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
862 if (any_of(ValidSatTys, equal_to(LT.second)))
863 NElts = 1;
864 break;
865 }
866 case Intrinsic::abs:
867 // Expansion takes 2 instructions for VALU
868 if (SLT == MVT::i16 || SLT == MVT::i32)
869 InstRate = 2 * getFullRateInstrCost();
870 break;
871 default:
872 break;
873 }
874
875 return LT.first * NElts * InstRate;
876}
877
// Control-flow instruction costs, accounting for the exec-mask manipulation
// that branches require on this target. NOTE(review): this excerpt elides
// the function signature and the initializer of SCost (presumably a check
// for size-oriented cost kinds, given the 'SCost ? small : large' uses).
880 const Instruction *I) const {
881 assert((I == nullptr || I->getOpcode() == Opcode) &&
882 "Opcode should reflect passed instruction.");
883 const bool SCost =
885 const int CBrCost = SCost ? 5 : 7;
886 switch (Opcode) {
887 case Instruction::UncondBr:
888 // Branch instruction takes about 4 slots on gfx900.
889 return SCost ? 1 : 4;
890 case Instruction::CondBr:
891 // Suppose conditional branch takes additional 3 exec manipulations
892 // instructions in average.
893 return CBrCost;
894 case Instruction::Switch: {
895 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
896 // Each case (including default) takes 1 cmp + 1 cbr instructions in
897 // average.
898 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
899 }
900 case Instruction::Ret:
901 return SCost ? 1 : 10;
902 }
903 return BaseT::getCFInstrCost(Opcode, CostKind, I);
904}
905
// Arithmetic reduction cost: targets with packed math (16-bit types only)
// can do the reduction at full rate; everything else defers to the base
// implementation. NOTE(review): this excerpt elides the function signature
// and the opening guard preceding the first 'return'.
908 std::optional<FastMathFlags> FMF,
911 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
912
913 EVT OrigTy = TLI->getValueType(DL, Ty);
914
915 // Computes cost on targets that have packed math instructions(which support
916 // 16-bit types only).
917 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
918 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
919
920 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
921 return LT.first * getFullRateInstrCost();
922}
923
// Min/max reduction cost: priced at half rate on targets with packed math
// and 16-bit scalar types; otherwise defers to the base implementation.
// NOTE(review): the function signature lines are elided from this excerpt.
926 FastMathFlags FMF,
928 EVT OrigTy = TLI->getValueType(DL, Ty);
929
930 // Computes cost on targets that have packed math instructions(which support
931 // 16-bit types only).
932 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
933 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
934
935 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
936 return LT.first * getHalfRateInstrCost(CostKind);
937}
938
// Cost of vector element insert/extract. NOTE(review): the first line of
// the function signature is elided from this excerpt.
940 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
941 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
942 switch (Opcode) {
943 case Instruction::ExtractElement:
944 case Instruction::InsertElement: {
945 unsigned EltSize
946 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
947 if (EltSize < 32) {
// Element 0 of a 16-bit vector is directly addressable with 16-bit insts.
948 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
949 return 0;
950 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
951 VIC);
952 }
953
954 // Extracts are just reads of a subregister, so are free. Inserts are
955 // considered free because we don't want to have any cost for scalarizing
956 // operations, and we don't have to copy into a different register class.
957
958 // Dynamic indexing isn't free and is best avoided.
959 return Index == ~0u ? 2 : 0;
960 }
961 default:
962 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
963 VIC);
964 }
965}
966
967/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
968/// this is analyzing the collective result of all output registers. Otherwise,
969/// this is only querying a specific result index if this returns multiple
970/// registers in a struct.
// NOTE(review): the function signature line is elided from this excerpt.
972 const CallInst *CI, ArrayRef<unsigned> Indices) const {
973 // TODO: Handle complex extract indices
// Conservatively treat multi-index extracts as divergent.
974 if (Indices.size() > 1)
975 return true;
976
977 const DataLayout &DL = CI->getDataLayout();
978 const SIRegisterInfo *TRI = ST->getRegisterInfo();
979 TargetLowering::AsmOperandInfoVector TargetConstraints =
980 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
981
982 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
983
984 int OutputIdx = 0;
985 for (auto &TC : TargetConstraints) {
986 if (TC.Type != InlineAsm::isOutput)
987 continue;
988
989 // Skip outputs we don't care about.
990 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
991 continue;
992
993 TLI->ComputeConstraintToUse(TC, SDValue());
994
995 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
996 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
997
998 // For AGPR constraints null is returned on subtargets without AGPRs, so
999 // assume divergent for null.
1000 if (!RC || !TRI->isSGPRClass(RC))
1001 return true;
1002 }
1003
// All queried outputs land in SGPRs: uniform.
1004 return false;
1005}
1006
// Determines whether a read_register intrinsic reads a divergent (per-lane)
// register by inspecting the register-name metadata. NOTE(review): this
// excerpt elides the function signature and the declaration of RegName
// (presumably a StringRef bound to the cast expression below).
1008 const IntrinsicInst *ReadReg) const {
1009 Metadata *MD =
1010 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
1012 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1013
1014 // Special case registers that look like VCC.
1015 MVT VT = MVT::getVT(ReadReg->getType());
1016 if (VT == MVT::i1)
1017 return true;
1018
1019 // Special case scalar registers that start with 'v'.
1020 if (RegName.starts_with("vcc") || RegName.empty())
1021 return false;
1022
1023 // VGPR or AGPR is divergent. There aren't any specially named vector
1024 // registers.
1025 return RegName[0] == 'v' || RegName[0] == 'a';
1026}
1027
1028/// \returns true if the result of the value could potentially be
1029/// different across workitems in a wavefront.
1030bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1031 if (const Argument *A = dyn_cast<Argument>(V))
1033
1034 // Loads from the private and flat address spaces are divergent, because
1035 // threads can execute the load instruction with the same inputs and get
1036 // different results.
1037 //
1038 // All other loads are not divergent, because if threads issue loads with the
1039 // same arguments, they will always get the same result.
1040 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1041 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1042 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1043
1044 // Atomics are divergent because they are executed sequentially: when an
1045 // atomic operation refers to the same address in each thread, then each
1046 // thread after the first sees the value written by the previous thread as
1047 // original value.
1049 return true;
1050
1052 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1053 switch (IID) {
1054 case Intrinsic::read_register:
1056 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1057 unsigned SrcAS =
1058 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1059 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1060 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1061 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1062 ST->hasGloballyAddressableScratch();
1063 }
1064 case Intrinsic::amdgcn_workitem_id_y:
1065 case Intrinsic::amdgcn_workitem_id_z: {
1066 const Function *F = Intrinsic->getFunction();
1067 bool HasUniformYZ =
1068 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequitezUniformYZ=*/true);
1069 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1070 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1071 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1072 }
1073 default:
1075 }
1076 }
1077
1078 // Assume all function calls are a source of divergence.
1079 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1080 if (CI->isInlineAsm())
1082 return true;
1083 }
1084
1085 // Assume all function calls are a source of divergence.
1086 if (isa<InvokeInst>(V))
1087 return true;
1088
1089 // If the target supports globally addressable scratch, the mapping from
1090 // scratch memory to the flat aperture changes therefore an address space cast
1091 // is no longer uniform.
1092 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1093 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1094 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1095 ST->hasGloballyAddressableScratch();
1096 }
1097
1098 return false;
1099}
1100
1101bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1102 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1103 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1104
1105 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1106 if (CI->isInlineAsm())
1108 return false;
1109 }
1110
1111 // In most cases TID / wavefrontsize is uniform.
1112 //
1113 // However, if a kernel has uneven dimesions we can have a value of
1114 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1115 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1116 // packed into a same wave which gives 1 and 0 after the division by 64
1117 // respectively.
1118 //
1119 // The X dimension doesn't reset within a wave if either both the Y
1120 // and Z dimensions are of length 1, or if the X dimension's required
1121 // size is a power of 2. Note, however, if the X dimension's maximum
1122 // size is a power of 2 < the wavefront size, division by the wavefront
1123 // size is guaranteed to yield 0, so this is also a no-reset case.
1124 bool XDimDoesntResetWithinWaves = false;
1125 if (auto *I = dyn_cast<Instruction>(V)) {
1126 const Function *F = I->getFunction();
1127 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1128 }
1129 using namespace llvm::PatternMatch;
1130 uint64_t C;
1132 m_ConstantInt(C))) ||
1134 m_ConstantInt(C)))) {
1135 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1136 }
1137
1138 Value *Mask;
1140 m_Value(Mask)))) {
1141 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1142 ST->getWavefrontSizeLog2() &&
1143 XDimDoesntResetWithinWaves;
1144 }
1145
1146 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1147 if (!ExtValue)
1148 return false;
1149
1150 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1151 if (!CI)
1152 return false;
1153
1154 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1155 switch (Intrinsic->getIntrinsicID()) {
1156 default:
1157 return false;
1158 case Intrinsic::amdgcn_if:
1159 case Intrinsic::amdgcn_else: {
1160 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1161 return Indices.size() == 1 && Indices[0] == 1;
1162 }
1163 }
1164 }
1165
1166 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1167 // divergent for the overall struct return. We need to override it in the
1168 // case we're extracting an SGPR component here.
1169 if (CI->isInlineAsm())
1170 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1171
1172 return false;
1173}
1174
1176 Intrinsic::ID IID) const {
1177 switch (IID) {
1178 case Intrinsic::amdgcn_is_shared:
1179 case Intrinsic::amdgcn_is_private:
1180 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1181 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1182 case Intrinsic::amdgcn_load_to_lds:
1183 case Intrinsic::amdgcn_make_buffer_rsrc:
1184 OpIndexes.push_back(0);
1185 return true;
1186 default:
1187 return false;
1188 }
1189}
1190
1192 Value *OldV,
1193 Value *NewV) const {
1194 auto IntrID = II->getIntrinsicID();
1195 switch (IntrID) {
1196 case Intrinsic::amdgcn_is_shared:
1197 case Intrinsic::amdgcn_is_private: {
1198 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1200 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1201 LLVMContext &Ctx = NewV->getType()->getContext();
1202 ConstantInt *NewVal = (TrueAS == NewAS) ?
1204 return NewVal;
1205 }
1206 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1207 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1208 Type *DestTy = II->getType();
1209 Type *SrcTy = NewV->getType();
1210 unsigned NewAS = SrcTy->getPointerAddressSpace();
1212 return nullptr;
1213 Module *M = II->getModule();
1215 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1216 II->setArgOperand(0, NewV);
1217 II->setCalledFunction(NewDecl);
1218 return II;
1219 }
1220 case Intrinsic::amdgcn_load_to_lds: {
1221 Type *SrcTy = NewV->getType();
1222 Module *M = II->getModule();
1223 Function *NewDecl =
1224 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1225 II->setArgOperand(0, NewV);
1226 II->setCalledFunction(NewDecl);
1227 return II;
1228 }
1229 case Intrinsic::amdgcn_make_buffer_rsrc: {
1230 Type *SrcTy = NewV->getType();
1231 Type *DstTy = II->getType();
1232 Module *M = II->getModule();
1234 M, II->getIntrinsicID(), {DstTy, SrcTy});
1235 II->setArgOperand(0, NewV);
1236 II->setCalledFunction(NewDecl);
1237 return II;
1238 }
1239 default:
1240 return nullptr;
1241 }
1242}
1243
1245 VectorType *DstTy, VectorType *SrcTy,
1246 ArrayRef<int> Mask,
1248 int Index, VectorType *SubTp,
1250 const Instruction *CxtI) const {
1251 if (!isa<FixedVectorType>(SrcTy))
1252 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1253 SubTp);
1254
1255 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1256
1257 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1258 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1259 (ScalarSize == 16 || ScalarSize == 8)) {
1260 // Larger vector widths may require additional instructions, but are
1261 // typically cheaper than scalarized versions.
1262 //
1263 // We assume that shuffling at a register granularity can be done for free.
1264 // This is not true for vectors fed into memory instructions, but it is
1265 // effectively true for all other shuffling. The emphasis of the logic here
1266 // is to assist generic transform in cleaning up / canonicalizing those
1267 // shuffles.
1268
1269 // With op_sel VOP3P instructions freely can access the low half or high
1270 // half of a register, so any swizzle of two elements is free.
1271 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1272 unsigned NumSrcElts = SrcVecTy->getNumElements();
1273 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1274 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1275 Kind == TTI::SK_PermuteSingleSrc))
1276 return 0;
1277 }
1278
1279 unsigned EltsPerReg = 32 / ScalarSize;
1280 switch (Kind) {
1281 case TTI::SK_Broadcast:
1282 // A single v_perm_b32 can be re-used for all destination registers.
1283 return 1;
1284 case TTI::SK_Reverse:
1285 // One instruction per register.
1286 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1287 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1290 if (Index % EltsPerReg == 0)
1291 return 0; // Shuffling at register granularity
1292 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1293 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1296 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1297 if (!DstVecTy)
1299 unsigned NumDstElts = DstVecTy->getNumElements();
1300 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1301 unsigned EndIndex = Index + NumInsertElts;
1302 unsigned BeginSubIdx = Index % EltsPerReg;
1303 unsigned EndSubIdx = EndIndex % EltsPerReg;
1304 unsigned Cost = 0;
1305
1306 if (BeginSubIdx != 0) {
1307 // Need to shift the inserted vector into place. The cost is the number
1308 // of destination registers overlapped by the inserted vector.
1309 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1310 }
1311
1312 // If the last register overlap is partial, there may be three source
1313 // registers feeding into it; that takes an extra instruction.
1314 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1315 Cost += 1;
1316
1317 return Cost;
1318 }
1319 case TTI::SK_Splice: {
1320 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1321 if (!DstVecTy)
1323 unsigned NumElts = DstVecTy->getNumElements();
1324 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1325 // Determine the sub-region of the result vector that requires
1326 // sub-register shuffles / mixing.
1327 unsigned EltsFromLHS = NumElts - Index;
1328 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1329 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1330 if (LHSIsAligned && RHSIsAligned)
1331 return 0;
1332 if (LHSIsAligned && !RHSIsAligned)
1333 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1334 if (!LHSIsAligned && RHSIsAligned)
1335 return divideCeil(EltsFromLHS, EltsPerReg);
1336 return divideCeil(NumElts, EltsPerReg);
1337 }
1338 default:
1339 break;
1340 }
1341
1342 if (!Mask.empty()) {
1343 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1344
1345 // Generically estimate the cost by assuming that each destination
1346 // register is derived from sources via v_perm_b32 instructions if it
1347 // can't be copied as-is.
1348 //
1349 // For each destination register, derive the cost of obtaining it based
1350 // on the number of source registers that feed into it.
1351 unsigned Cost = 0;
1352 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1354 bool Aligned = true;
1355 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1356 int SrcIdx = Mask[DstIdx + I];
1357 if (SrcIdx == -1)
1358 continue;
1359 int Reg;
1360 if (SrcIdx < (int)NumSrcElts) {
1361 Reg = SrcIdx / EltsPerReg;
1362 if (SrcIdx % EltsPerReg != I)
1363 Aligned = false;
1364 } else {
1365 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1366 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1367 Aligned = false;
1368 }
1369 if (!llvm::is_contained(Regs, Reg))
1370 Regs.push_back(Reg);
1371 }
1372 if (Regs.size() >= 2)
1373 Cost += Regs.size() - 1;
1374 else if (!Aligned)
1375 Cost += 1;
1376 }
1377 return Cost;
1378 }
1379 }
1380
1381 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1382 SubTp);
1383}
1384
1385/// Whether it is profitable to sink the operands of an
1386/// Instruction I to the basic block of I.
1387/// This helps using several modifiers (like abs and neg) more often.
1389 SmallVectorImpl<Use *> &Ops) const {
1390 using namespace PatternMatch;
1391
1392 for (auto &Op : I->operands()) {
1393 // Ensure we are not already sinking this operand.
1394 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1395 continue;
1396
1397 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1398 Ops.push_back(&Op);
1399 continue;
1400 }
1401
1402 // Check for zero-cost multiple use InsertElement/ExtractElement
1403 // instructions
1404 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1405 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1406 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1407 if (VecOpInst && VecOpInst->hasOneUse())
1408 continue;
1409
1410 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1412 OpInst->getOperand(0),
1413 OpInst->getOperand(1)) == 0) {
1414 Ops.push_back(&Op);
1415 continue;
1416 }
1417 }
1418 }
1419
1420 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1421
1422 unsigned EltSize = DL.getTypeSizeInBits(
1423 cast<VectorType>(Shuffle->getType())->getElementType());
1424
1425 // For i32 (or greater) shufflevectors, these will be lowered into a
1426 // series of insert / extract elements, which will be coalesced away.
1427 if (EltSize < 16 || !ST->has16BitInsts())
1428 continue;
1429
1430 int NumSubElts, SubIndex;
1431 if (Shuffle->changesLength()) {
1432 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1433 Ops.push_back(&Op);
1434 continue;
1435 }
1436
1437 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1438 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1439 !(SubIndex & 0x1)) {
1440 Ops.push_back(&Op);
1441 continue;
1442 }
1443 }
1444
1445 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1446 Shuffle->isSingleSource()) {
1447 Ops.push_back(&Op);
1448 continue;
1449 }
1450 }
1451 }
1452
1453 return !Ops.empty();
1454}
1455
1457 const Function *Callee) const {
1458 const TargetMachine &TM = getTLI()->getTargetMachine();
1459 const GCNSubtarget *CallerST
1460 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1461 const GCNSubtarget *CalleeST
1462 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1463
1464 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1465 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1466
1467 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1468 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1469 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1470 return false;
1471
1472 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1473 // no way to support merge for backend defined attributes.
1474 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1475 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1476 if (!CallerMode.isInlineCompatible(CalleeMode))
1477 return false;
1478
1479 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1480 Callee->hasFnAttribute(Attribute::InlineHint))
1481 return true;
1482
1483 // Hack to make compile times reasonable.
1484 if (InlineMaxBB) {
1485 // Single BB does not increase total BB amount.
1486 if (Callee->size() == 1)
1487 return true;
1488 size_t BBSize = Caller->size() + Callee->size() - 1;
1489 return BBSize <= InlineMaxBB;
1490 }
1491
1492 return true;
1493}
1494
1496 const SITargetLowering *TLI,
1497 const GCNTTIImpl *TTIImpl) {
1498 const int NrOfSGPRUntilSpill = 26;
1499 const int NrOfVGPRUntilSpill = 32;
1500
1501 const DataLayout &DL = TTIImpl->getDataLayout();
1502
1503 unsigned adjustThreshold = 0;
1504 int SGPRsInUse = 0;
1505 int VGPRsInUse = 0;
1506 for (const Use &A : CB->args()) {
1507 SmallVector<EVT, 4> ValueVTs;
1508 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1509 for (auto ArgVT : ValueVTs) {
1510 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1511 CB->getContext(), CB->getCallingConv(), ArgVT);
1513 SGPRsInUse += CCRegNum;
1514 else
1515 VGPRsInUse += CCRegNum;
1516 }
1517 }
1518
1519 // The cost of passing function arguments through the stack:
1520 // 1 instruction to put a function argument on the stack in the caller.
1521 // 1 instruction to take a function argument from the stack in callee.
1522 // 1 instruction is explicitly take care of data dependencies in callee
1523 // function.
1524 InstructionCost ArgStackCost(1);
1525 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1526 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1528 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1529 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1531
1532 // The penalty cost is computed relative to the cost of instructions and does
1533 // not model any storage costs.
1534 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1535 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1536 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1537 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1538 return adjustThreshold;
1539}
1540
1541static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1542 const DataLayout &DL) {
1543 // If we have a pointer to a private array passed into a function
1544 // it will not be optimized out, leaving scratch usage.
1545 // This function calculates the total size in bytes of the memory that would
1546 // end in scratch if the call was not inlined.
1547 unsigned AllocaSize = 0;
1549 for (Value *PtrArg : CB->args()) {
1550 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1551 if (!Ty)
1552 continue;
1553
1554 unsigned AddrSpace = Ty->getAddressSpace();
1555 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1556 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1557 continue;
1558
1560 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1561 continue;
1562
1563 if (auto Size = AI->getAllocationSize(DL))
1564 AllocaSize += Size->getFixedValue();
1565 }
1566 return AllocaSize;
1567}
1568
1573
1575 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1576
1577 // Private object passed as arguments may end up in scratch usage if the call
1578 // is not inlined. Increase the inline threshold to promote inlining.
1579 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1580 if (AllocaSize > 0)
1581 Threshold += ArgAllocaCost;
1582 return Threshold;
1583}
1584
1586 const AllocaInst *AI) const {
1587
1588 // Below the cutoff, assume that the private memory objects would be
1589 // optimized
1590 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1591 if (AllocaSize <= ArgAllocaCutoff)
1592 return 0;
1593
1594 // Above the cutoff, we give a cost to each private memory object
1595 // depending its size. If the array can be optimized by SROA this cost is not
1596 // added to the total-cost in the inliner cost analysis.
1597 //
1598 // We choose the total cost of the alloca such that their sum cancels the
1599 // bonus given in the threshold (ArgAllocaCost).
1600 //
1601 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1602 //
1603 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1604 // the single-bb bonus and the vector-bonus.
1605 //
1606 // We compensate the first two multipliers, by repeating logic from the
1607 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1608 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1609 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1610
1611 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1612 return BB.getTerminator()->getNumSuccessors() > 1;
1613 });
1614 if (SingleBB) {
1615 Threshold += Threshold / 2;
1616 }
1617
1618 auto ArgAllocaSize = AI->getAllocationSize(DL);
1619 if (!ArgAllocaSize)
1620 return 0;
1621
1622 // Attribute the bonus proportionally to the alloca size
1623 unsigned AllocaThresholdBonus =
1624 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1625
1626 return AllocaThresholdBonus;
1627}
1628
1631 OptimizationRemarkEmitter *ORE) const {
1632 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1633}
1634
1636 TTI::PeelingPreferences &PP) const {
1637 CommonTTI.getPeelingPreferences(L, SE, PP);
1638}
1639
1640int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1641 return ST->hasFullRate64Ops()
1642 ? getFullRateInstrCost()
1643 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1644 : getQuarterRateInstrCost(CostKind);
1645}
1646
1647std::pair<InstructionCost, MVT>
1648GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1649 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1650 auto Size = DL.getTypeSizeInBits(Ty);
1651 // Maximum load or store can handle 8 dwords for scalar and 4 for
1652 // vector ALU. Let's assume anything above 8 dwords is expensive
1653 // even if legal.
1654 if (Size <= 256)
1655 return Cost;
1656
1657 Cost.first += (Size + 255) / 256;
1658 return Cost;
1659}
1660
1662 return ST->hasPrefetch() ? 128 : 0;
1663}
1664
1667}
1668
1670 const Function &F,
1671 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1672 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1673 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1674 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1675 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1676 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1677 ST->getFlatWorkGroupSizes(F);
1678 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1679 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1680 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1681 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1682 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1683}
1684
1687 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1688 return KnownIEEEMode::On; // Only mode on gfx1170+
1689
1690 const Function *F = I.getFunction();
1691 if (!F)
1693
1694 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1695 if (IEEEAttr.isValid())
1697
1698 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1700}
1701
1703 Align Alignment,
1704 unsigned AddressSpace,
1706 TTI::OperandValueInfo OpInfo,
1707 const Instruction *I) const {
1708 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1709 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1710 VecTy->getElementType()->isIntegerTy(8)) {
1711 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1713 }
1714 }
1715 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1716 OpInfo, I);
1717}
1718
1720 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1721 if (VecTy->getElementType()->isIntegerTy(8)) {
1722 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1723 return divideCeil(ElementCount - 1, 4);
1724 }
1725 }
1726 return BaseT::getNumberOfParts(Tp);
1727}
1728
1731 if (isAlwaysUniform(V))
1733
1734 if (isSourceOfDivergence(V))
1736
1738}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool RequiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool approxFunc() const
Definition FMF.h:73
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:802
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionUniformity getInstructionUniformity(const Value *V) const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1080
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:297
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:296
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:294
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:295
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:128
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:300
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
Definition Types.h:26
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
const unsigned PragmaCount
Definition UnrollLoop.h:131
const bool PragmaEnableUnroll
Definition UnrollLoop.h:132