LLVM 23.0.0git
GCNSubtarget.h
Go to the documentation of this file.
1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
26
27#define GET_SUBTARGETINFO_HEADER
28#include "AMDGPUGenSubtargetInfo.inc"
29
30namespace llvm {
31
32class GCNTargetMachine;
33
34/// Module flag names controlling out-of-bounds buffer access semantics.
35/// Each flag is an i32 with Module::Max merge behaviour and tri-state values:
36/// 0 = any (absent/default - backend currently treats as strict)
37/// 1 = relaxed
38/// 2 = strict
39namespace AMDGPUOOBMode {
40inline constexpr StringLiteral BufferFlag("amdgpu.buffer.oob.mode");
41inline constexpr StringLiteral TBufferFlag("amdgpu.tbuffer.oob.mode");
42} // namespace AMDGPUOOBMode
43
45 public AMDGPUSubtarget {
46public:
48
49 // Following 2 enums are documented at:
50 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
51 enum class TrapHandlerAbi {
52 NONE = 0x00,
53 AMDHSA = 0x01,
54 };
55
56 enum class TrapID {
59 };
60
61private:
62 /// SelectionDAGISel related APIs.
63 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
64
65 /// GlobalISel related APIs.
66 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
67 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
68 std::unique_ptr<InstructionSelector> InstSelector;
69 std::unique_ptr<LegalizerInfo> Legalizer;
70 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
71
72protected:
73 // Basic subtarget description.
75 unsigned Gen = INVALID;
77 int LDSBankCount = 0;
79
80 // Instruction cache line size in bytes; set from TableGen subtarget features.
81 unsigned InstCacheLineSize = 0;
82
83 // Dynamically set bits that enable features.
84 bool DynamicVGPR = false;
86 bool ScalarizeGlobal = false;
87 const bool BufferOOBRelaxed;
89
90 /// The maximum number of instructions that may be placed within an S_CLAUSE,
91 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
92 /// indicates a lack of S_CLAUSE support.
93 unsigned MaxHardClauseLength = 0;
94
95#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
96 bool ATTRIBUTE = DEFAULT;
97#include "AMDGPUGenSubtargetInfo.inc"
98
99private:
100 SIInstrInfo InstrInfo;
101 SITargetLowering TLInfo;
102 SIFrameLowering FrameLowering;
103
104 /// Get the register that represents the actual dependency between the
105 /// definition and the use. The definition might only affect a subregister
106 /// that is not actually used. Works for both virtual and physical registers.
107 /// Note: Currently supports VOP3P instructions (without WMMA and SWMMAC).
108 /// Returns the definition register if there is a real dependency and no
109 /// better match is found.
110 Register getRealSchedDependency(const MachineInstr &DefI, int DefOpIdx,
111 const MachineInstr &UseI, int UseOpIdx) const;
112
113public:
114 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
115 const GCNTargetMachine &TM, bool BufferOOBRelaxed = false,
116 bool TBufferOOBRelaxed = false);
117 ~GCNSubtarget() override;
118
120 StringRef FS);
121
122 /// Diagnose inconsistent subtarget features before attempting to codegen
123 /// function \p F.
124 void checkSubtargetFeatures(const Function &F) const;
125
126 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
127
128 const SIFrameLowering *getFrameLowering() const override {
129 return &FrameLowering;
130 }
131
132 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
133
134 const SIRegisterInfo *getRegisterInfo() const override {
135 return &InstrInfo.getRegisterInfo();
136 }
137
138 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
139
140 const CallLowering *getCallLowering() const override {
141 return CallLoweringInfo.get();
142 }
143
144 const InlineAsmLowering *getInlineAsmLowering() const override {
145 return InlineAsmLoweringInfo.get();
146 }
147
149 return InstSelector.get();
150 }
151
152 const LegalizerInfo *getLegalizerInfo() const override {
153 return Legalizer.get();
154 }
155
156 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
157 return RegBankInfo.get();
158 }
159
160 const AMDGPU::TargetID &getTargetID() const { return TargetID; }
161
163 return &InstrItins;
164 }
165
167
169
170 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
171
172#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
173 bool GETTER() const override { return ATTRIBUTE; }
174#include "AMDGPUGenSubtargetInfo.inc"
175
176 unsigned getMaxWaveScratchSize() const {
177 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
178 if (getGeneration() >= GFX12) {
179 // 18-bit field in units of 64-dword.
180 return (64 * 4) * ((1 << 18) - 1);
181 }
182 if (getGeneration() == GFX11) {
183 // 15-bit field in units of 64-dword.
184 return (64 * 4) * ((1 << 15) - 1);
185 }
186 // 13-bit field in units of 256-dword.
187 return (256 * 4) * ((1 << 13) - 1);
188 }
189
190 /// Return the number of high bits known to be zero for a frame index.
194
195 int getLDSBankCount() const { return LDSBankCount; }
196
197 /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
198 unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
199
200 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
201 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
202 : 16;
203 }
204
205 unsigned getConstantBusLimit(unsigned Opcode) const;
206
207 /// Returns if the result of this instruction with a 16-bit result returned in
208 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
209 /// the original value.
210 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
211
212 bool supportsWGP() const {
213 if (HasGFX1250Insts)
214 return false;
215 return getGeneration() >= GFX10;
216 }
217
218 bool hasHWFP64() const { return HasFP64; }
219
220 bool hasAddr64() const {
222 }
223
224 bool hasFlat() const {
226 }
227
228 // Return true if the target only has the reverse operand versions of VALU
229 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
230 bool hasOnlyRevVALUShifts() const {
232 }
233
234 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
235
236 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
237
238 bool hasMin3Max3_16() const {
240 }
241
242 bool hasSwap() const { return HasGFX9Insts; }
243
244 bool hasScalarPackInsts() const { return HasGFX9Insts; }
245
246 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
247
248 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
249
250 bool hasAsyncMark() const { return hasVMemToLDSLoad() || HasAsynccnt; }
251
255
257 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
258 return getGeneration() >= GFX9;
259 }
260
261 /// True if the offset field of DS instructions works as expected. On SI, the
262 /// offset uses a 16-bit adder and does not always wrap properly.
263 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
264
266 return EnableUnsafeDSOffsetFolding;
267 }
268
269 /// Condition output from div_scale is usable.
273
274 /// Extra wait hazard is needed in some cases before
275 /// s_cbranch_vccnz/s_cbranch_vccz.
276 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
277
278 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
279 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
280
281 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
282 /// was written by a VALU instruction.
285 }
286
287 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
288 /// SGPR was written by a VALU Instruction.
291 }
292
293 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
294
295 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
296 unsigned getSetRegWaitStates() const {
297 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
298 }
299
300 /// Return the amount of LDS that can be used that will not restrict the
301 /// occupancy lower than WaveCount.
302 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
303 const Function &) const;
304
307 }
308
309 /// \returns If target supports S_DENORM_MODE.
310 bool hasDenormModeInst() const {
312 }
313
314 /// \returns If target supports ds_read/write_b128 and user enables generation
315 /// of ds_read/write_b128.
316 bool useDS128() const { return HasCIInsts && EnableDS128; }
317
318 /// \return If target supports ds_read/write_b96/128.
319 bool hasDS96AndDS128() const { return HasCIInsts; }
320
321 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
322 bool haveRoundOpsF64() const { return HasCIInsts; }
323
324 /// \returns If MUBUF instructions always perform range checking, even for
325 /// buffer resources used for private memory access.
329
330 /// \returns If target requires PRT Strict NULL support (zero result registers
331 /// for sparse texture support).
332 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
333
335 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
336 }
337
339 return HasUnalignedDSAccess && HasUnalignedAccessMode;
340 }
341
343 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
344 }
345
346 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
347
348 bool isTgSplitEnabled() const { return EnableTgSplit; }
349
352
353 bool isCuModeEnabled() const { return EnableCuMode; }
354
355 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
356
357 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
358
359 // Check if target supports ST addressing mode with FLAT scratch instructions.
360 // The ST addressing mode means no registers are used, either VGPR or SGPR,
361 // but only immediate offset is swizzled and added to the FLAT scratch base.
362 bool hasFlatScratchSTMode() const {
363 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
364 }
365
366 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
367
369 return hasArchitectedFlatScratch() ||
370 (EnableFlatScratch && hasFlatScratchInsts());
371 }
372
373 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
374
375 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
376
377 bool hasExportInsts() const {
378 return !hasGFX940Insts() && !hasGFX1250Insts();
379 }
380
381 bool hasVINTERPEncoding() const {
382 return HasGFX11Insts && !hasGFX1250Insts();
383 }
384
385 // DS_ADD_F64/DS_ADD_RTN_F64
386 bool hasLdsAtomicAddF64() const {
387 return hasGFX90AInsts() || hasGFX1250Insts();
388 }
389
391 return getGeneration() >= GFX9;
392 }
393
394 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
395
396 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
397
399 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
400 }
401
402 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
403
404 /// Return if most LDS instructions have an m0 use that requires m0 to be
405 /// initialized.
406 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
407
408 // True if the hardware rewinds and replays GWS operations if a wave is
409 // preempted.
410 //
411 // If this is false, a GWS operation requires testing if a nack set the
412 // MEM_VIOL bit, and repeating if so.
413 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
414
415 /// \returns if target has ds_gws_sema_release_all instruction.
416 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
417
418 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
419
420 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
421
422 // Covers VS/PS/CS graphics shaders
423 bool isMesaGfxShader(const Function &F) const {
424 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
425 }
426
427 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
428
429 bool hasAtomicFaddInsts() const {
430 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
431 }
432
434 return getGeneration() < SEA_ISLANDS;
435 }
436
437 bool hasInstPrefetch() const {
438 return getGeneration() == GFX10 || getGeneration() == GFX11;
439 }
440
441 bool hasPrefetch() const { return HasGFX12Insts; }
442
443 bool hasInstPrefSize() const { return isGFX11Plus(); }
444
445 void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width,
446 uint32_t &CacheLineSize) const {
449 if (getGeneration() == GFX11) {
450 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE;
451 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_SHIFT;
452 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX11_INST_PREF_SIZE_WIDTH;
453 } else {
454 Mask = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE;
455 Shift = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_SHIFT;
456 Width = amdhsa::COMPUTE_PGM_RSRC3_GFX12_PLUS_INST_PREF_SIZE_WIDTH;
457 }
458 }
459
460 // Has s_cmpk_* instructions.
461 bool hasSCmpK() const { return getGeneration() < GFX12; }
462
463 // Scratch is allocated in 256 dword per wave blocks for the entire
464 // wavefront. When viewed from the perspective of an arbitrary workitem, this
465 // is 4-byte aligned.
466 //
467 // Only 4-byte alignment is really needed to access anything. Transformations
468 // on the pointer value itself may rely on the alignment / known low bits of
469 // the pointer. Set this to something above the minimum to avoid needing
470 // dynamic realignment in common cases.
471 Align getStackAlignment() const { return Align(16); }
472
473 bool enableMachineScheduler() const override { return true; }
474
475 bool useAA() const override;
476
477 bool enableSubRegLiveness() const override { return true; }
478
481
482 // static wrappers
483 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
484
485 // XXX - Why is this here if it isn't in the default pass set?
486 bool enableEarlyIfConversion() const override { return true; }
487
489 const SchedRegion &Region) const override;
490
492 const SchedRegion &Region) const override;
493
494 void mirFileLoaded(MachineFunction &MF) const override;
495
496 unsigned getMaxNumUserSGPRs() const {
497 return AMDGPU::getMaxNumUserSGPRs(*this);
498 }
499
500 bool useVGPRIndexMode() const;
501
502 bool hasScalarCompareEq64() const {
504 }
505
506 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
507 bool hasLDSFPAtomicAddF64() const {
508 return HasGFX90AInsts || HasGFX1250Insts;
509 }
510
511 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
512 bool hasPermLane64() const { return getGeneration() >= GFX11; }
513
514 /// \returns true if the subtarget supports the ds_swizzle rotate and FFT
515 /// swizzle modes (GFX9+).
516 bool hasDsSwizzleRotateMode() const { return getGeneration() >= GFX9; }
517
518 bool hasDPPRowShare() const {
519 return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
520 }
521
522 // Has V_PK_MOV_B32 opcode
523 bool hasPkMovB32() const { return HasGFX90AInsts; }
524
526 return getGeneration() >= GFX10 || hasGFX940Insts();
527 }
528
529 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
530
531 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
532
533 unsigned getNSAMaxSize(bool HasSampler = false) const {
534 return AMDGPU::getNSAMaxSize(*this, HasSampler);
535 }
536
537 bool hasMadF16() const;
538
539 // Scalar and global loads support scale_offset bit.
540 bool hasScaleOffset() const { return HasGFX1250Insts; }
541
542 // FLAT GLOBAL VOffset is signed
543 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
544
546
548 return HasUserSGPRInit16Bug && isWave32();
549 }
550
554
555 // \returns true if the subtarget supports DWORDX3 load/store instructions.
556 bool hasDwordx3LoadStores() const { return HasCIInsts; }
557
561
566
569 }
570
573 }
574
576 return HasLDSMisalignedBug && !EnableCuMode;
577 }
578
579 // Shift amount of a 64 bit shift cannot be a highest allocated register
580 // if also at the end of the allocation block.
581 bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
582
583 // Has one cycle hazard on transcendental instruction feeding a
584 // non transcendental VALU.
585 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
586
587 // Has one cycle hazard on a VALU instruction partially writing dst with
588 // a shift of result bits feeding another VALU instruction.
589 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
590
591 // Cannot use op_sel with v_dot instructions.
592 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
593
594 // Does not have HW interlocks for VALU writing and then reading SGPRs.
595 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
596
597 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
598
600 return getGeneration() == GFX10;
601 }
602
603 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
604
605 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
606
607 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
608
610 return getGeneration() == GFX11;
611 }
612
613 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
614
615 // All GFX9 targets experience a fetch delay when an instruction at the start
616 // of a loop header is split by a 32-byte fetch window boundary, but GFX950
617 // is uniquely sensitive to this: the delay triggers further performance
618 // degradation beyond the fetch latency itself.
619 bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
620
621 bool requiresCodeObjectV6() const { return RequiresCOV6; }
622
623 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
624
625 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
626
628 return HasGFX12Insts && !HasGFX1250Insts;
629 }
630
631 bool setRegModeNeedsVNOPs() const {
632 return HasGFX1250Insts && getGeneration() == GFX12;
633 }
634
635 /// Return if operations acting on VGPR tuples require even alignment.
636 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
637
638 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
639 bool hasSPackHL() const { return HasGFX11Insts; }
640
641 /// Return true if the target has the V_CVT_PK_I16_F32/V_CVT_PK_U16_F32
642 /// instructions.
643 bool hasVCvtPkIU16F32() const { return HasGFX11Insts; }
644
645 /// Return true if the target's EXP instruction has the COMPR flag, which
646 /// affects the meaning of the EN (enable) bits.
647 bool hasCompressedExport() const { return !HasGFX11Insts; }
648
649 /// Return true if the target's EXP instruction supports the NULL export
650 /// target.
651 bool hasNullExportTarget() const { return !HasGFX11Insts; }
652
653 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
654
655 /// Return true if the target has the S_DELAY_ALU instruction.
656 bool hasDelayAlu() const { return HasGFX11Insts; }
657
658 /// Returns true if the target supports
659 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
660 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
661 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
662
663 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
664 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
665 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
666
667 /// \returns true if the target has packed f32 instructions that only read 32
668 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
669 /// both channels.
671 return getGeneration() == GFX12 && HasGFX1250Insts;
672 }
673
674 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
675
676 /// \returns true if the target supports expert scheduling mode 2 which relies
677 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
678 /// instructions in some instances.
679 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
680
681 /// \returns The maximum number of instructions that can be enclosed in an
682 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
683 /// instruction.
684 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
685
686 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
687 /// SGPRs
688 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
689
690 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
691 /// VGPRs
692 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
693 unsigned DynamicVGPRBlockSize) const;
694
695 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
696 /// be achieved when the only function running on a CU is \p F, each workgroup
697 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
698 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
699 /// range, so this returns a range as well.
700 ///
701 /// Note that occupancy can be affected by the scratch allocation as well, but
702 /// we do not have enough information to compute it.
703 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
704 unsigned LDSSize = 0,
705 unsigned NumSGPRs = 0,
706 unsigned NumVGPRs = 0) const;
707
708 /// \returns true if the flat_scratch register should be initialized with the
709 /// pointer to the wave's scratch memory rather than a size and offset.
710 bool flatScratchIsPointer() const {
712 }
713
714 /// \returns true if the machine has merged shaders in which s0-s7 are
715 /// reserved by the hardware and user SGPRs start at s8
716 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
717
718 // \returns true if the target supports the pre-NGG legacy geometry path.
719 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
720
721 // \returns true if the target has split barriers feature
722 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
723
724 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
725 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
726
727 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
728 /// values.
729 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
730
731 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
732
733 bool hasVOPD3() const { return HasGFX1250Insts; }
734
735 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
736 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
737
738 // \returns true if target has S_GET_SHADER_CYCLES_U64 instruction.
739 bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
740
741 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
742 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
743 // extended VA to 57 bits.
745 return HasGFX12Insts && !HasGFX1250Insts;
746 }
747
748 // \returns true if the target needs to create a prolog for backward
749 // compatibility when preloading kernel arguments.
751 return hasKernargPreload() && !HasGFX1250Insts;
752 }
753
754 bool hasCondSubInsts() const { return HasGFX12Insts; }
755
756 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
757
758 bool hasFmaLegacy32Insts() const { return hasGFX10_3Insts(); }
759
760 /// \returns SGPR allocation granularity supported by the subtarget.
761 unsigned getSGPRAllocGranule() const {
763 }
764
765 /// \returns SGPR encoding granularity supported by the subtarget.
766 unsigned getSGPREncodingGranule() const {
768 }
769
770 /// \returns Total number of SGPRs supported by the subtarget.
771 unsigned getTotalNumSGPRs() const {
773 }
774
775 /// \returns Addressable number of SGPRs supported by the subtarget.
776 unsigned getAddressableNumSGPRs() const {
778 }
779
780 /// \returns Minimum number of SGPRs that meets the given number of waves per
781 /// execution unit requirement supported by the subtarget.
782 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
783 return AMDGPU::IsaInfo::getMinNumSGPRs(*this, WavesPerEU);
784 }
785
786 /// \returns Maximum number of SGPRs that meets the given number of waves per
787 /// execution unit requirement supported by the subtarget.
788 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
789 return AMDGPU::IsaInfo::getMaxNumSGPRs(*this, WavesPerEU, Addressable);
790 }
791
792 /// \returns Reserved number of SGPRs. This is common
793 /// utility function called by MachineFunction and
794 /// Function variants of getReservedNumSGPRs.
795 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
796 /// \returns Reserved number of SGPRs for given machine function \p MF.
797 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
798
799 /// \returns Reserved number of SGPRs for given function \p F.
800 unsigned getReservedNumSGPRs(const Function &F) const;
801
802 /// \returns Maximum number of preloaded SGPRs for the subtarget.
803 unsigned getMaxNumPreloadedSGPRs() const;
804
805 /// \returns max num SGPRs. This is the common utility
806 /// function called by MachineFunction and Function
807 /// variants of getMaxNumSGPRs.
808 unsigned getBaseMaxNumSGPRs(const Function &F,
809 std::pair<unsigned, unsigned> WavesPerEU,
810 unsigned PreloadedSGPRs,
811 unsigned ReservedNumSGPRs) const;
812
813 /// \returns Maximum number of SGPRs that meets number of waves per execution
814 /// unit requirement for function \p MF, or number of SGPRs explicitly
815 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
816 ///
817 /// \returns Value that meets number of waves per execution unit requirement
818 /// if explicitly requested value cannot be converted to integer, violates
819 /// subtarget's specifications, or does not meet number of waves per execution
820 /// unit requirement.
821 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
822
823 /// \returns Maximum number of SGPRs that meets number of waves per execution
824 /// unit requirement for function \p F, or number of SGPRs explicitly
825 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
826 ///
827 /// \returns Value that meets number of waves per execution unit requirement
828 /// if explicitly requested value cannot be converted to integer, violates
829 /// subtarget's specifications, or does not meet number of waves per execution
830 /// unit requirement.
831 unsigned getMaxNumSGPRs(const Function &F) const;
832
833 /// \returns VGPR allocation granularity supported by the subtarget.
834 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
835 return AMDGPU::IsaInfo::getVGPRAllocGranule(*this, DynamicVGPRBlockSize);
836 }
837
838 /// \returns VGPR encoding granularity supported by the subtarget.
839 unsigned getVGPREncodingGranule() const {
841 }
842
843 /// \returns Total number of VGPRs supported by the subtarget.
844 unsigned getTotalNumVGPRs() const {
846 }
847
848 /// \returns Addressable number of architectural VGPRs supported by the
849 /// subtarget.
853
854 /// \returns Addressable number of VGPRs supported by the subtarget.
855 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
856 return AMDGPU::IsaInfo::getAddressableNumVGPRs(*this, DynamicVGPRBlockSize);
857 }
858
859 /// \returns the minimum number of VGPRs that will prevent achieving more than
860 /// the specified number of waves \p WavesPerEU.
861 unsigned getMinNumVGPRs(unsigned WavesPerEU,
862 unsigned DynamicVGPRBlockSize) const {
863 return AMDGPU::IsaInfo::getMinNumVGPRs(*this, WavesPerEU,
864 DynamicVGPRBlockSize);
865 }
866
867 /// \returns the maximum number of VGPRs that can be used and still achieve
868 /// at least the specified number of waves \p WavesPerEU.
869 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
870 unsigned DynamicVGPRBlockSize) const {
871 return AMDGPU::IsaInfo::getMaxNumVGPRs(*this, WavesPerEU,
872 DynamicVGPRBlockSize);
873 }
874
875 /// \returns max num VGPRs. This is the common utility function
876 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
877 unsigned
879 std::pair<unsigned, unsigned> NumVGPRBounds) const;
880
881 /// \returns Maximum number of VGPRs that meets number of waves per execution
882 /// unit requirement for function \p F, or number of VGPRs explicitly
883 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
884 ///
885 /// \returns Value that meets number of waves per execution unit requirement
886 /// if explicitly requested value cannot be converted to integer, violates
887 /// subtarget's specifications, or does not meet number of waves per execution
888 /// unit requirement.
889 unsigned getMaxNumVGPRs(const Function &F) const;
890
891 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
892
893 /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
894 /// of waves per execution unit required for the function \p F.
895 std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
896
897 /// \returns Maximum number of VGPRs that meets number of waves per execution
898 /// unit requirement for function \p MF, or number of VGPRs explicitly
899 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
900 ///
901 /// \returns Value that meets number of waves per execution unit requirement
902 /// if explicitly requested value cannot be converted to integer, violates
903 /// subtarget's specifications, or does not meet number of waves per execution
904 /// unit requirement.
905 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
906
907 bool supportsWave32() const { return getGeneration() >= GFX10; }
908
909 bool supportsWave64() const { return !hasGFX1250Insts() || HasGFX13Insts; }
910
911 bool isWave32() const { return getWavefrontSize() == 32; }
912
913 bool isWave64() const { return getWavefrontSize() == 64; }
914
915 /// Returns if the wavesize of this subtarget is known reliable. This is false
916 /// only for the a default target-cpu that does not have an explicit
917 /// +wavefrontsize target feature.
918 bool isWaveSizeKnown() const {
919 return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
920 hasFeature(AMDGPU::FeatureWavefrontSize64);
921 }
922
924 return getRegisterInfo()->getBoolRC();
925 }
926
927 /// \returns Maximum number of work groups per compute unit supported by the
928 /// subtarget and limited by given \p FlatWorkGroupSize.
929 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
930 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(*this, FlatWorkGroupSize);
931 }
932
933 /// \returns Minimum flat work group size supported by the subtarget.
934 unsigned getMinFlatWorkGroupSize() const override {
936 }
937
938 /// \returns Maximum flat work group size supported by the subtarget.
939 unsigned getMaxFlatWorkGroupSize() const override {
941 }
942
943 /// \returns Number of waves per execution unit required to support the given
944 /// \p FlatWorkGroupSize.
945 unsigned
946 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
947 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(*this, FlatWorkGroupSize);
948 }
949
950 /// \returns Minimum number of waves per execution unit supported by the
951 /// subtarget.
952 unsigned getMinWavesPerEU() const override {
954 }
955
956 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
957 SDep &Dep,
958 const TargetSchedModel *SchedModel) const override;
959
960 // \returns true if it's beneficial on this subtarget for the scheduler to
961 // cluster stores as well as loads.
962 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
963
964 // \returns the number of address arguments from which to enable MIMG NSA
965 // on supported architectures.
966 unsigned getNSAThreshold(const MachineFunction &MF) const;
967
968 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
969 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
970 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
971
972 // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
973 // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
974 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
975
976 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
977 unsigned getDynamicVGPRBlockSize() const {
978 return DynamicVGPRBlockSize32 ? 32 : 16;
979 }
980
982 // AMDGPU doesn't care if early-clobber and undef operands are allocated
983 // to the same register.
984 return false;
985 }
986
987 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
988 // and shall be surrounded by S_WAIT_ALU(0xFFE3).
990 return getGeneration() == GFX12;
991 }
992
993 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
994 // read.
996 return HasGFX1250Insts && getGeneration() == GFX12;
997 }
998
999 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
1000 // result.
1002 return HasGFX1250Insts && getGeneration() == GFX12;
1003 }
1004
1005 /// \returns true if the subtarget requires a wait for xcnt before VMEM
1006 /// accesses that must never be repeated in the event of a page fault/re-try.
1007 /// Atomic stores/rmw and all volatile accesses fall under these criteria.
1009 return HasGFX1250Insts;
1010 }
1011
1012 /// \returns the number of significant bits in the immediate field of the
1013 /// S_NOP instruction.
1014 unsigned getSNopBits() const {
1016 return 7;
1018 return 4;
1019 return 3;
1020 }
1021
1025
1027 return (getGeneration() <= AMDGPUSubtarget::GFX9 ||
1029 isWave32();
1030 }
1031
1032 /// Return true if real (non-fake) variants of True16 instructions using
1033 /// 16-bit registers should be code-generated. Fake True16 instructions are
1034 /// identical to non-fake ones except that they take 32-bit registers as
1035 /// operands and always use their low halves.
1036 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1037 // supported and the support for fake True16 instructions is removed.
1038 bool useRealTrue16Insts() const {
1039 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1040 }
1041
1043 return getGeneration() >= GFX10 || isTgSplitEnabled();
1044 }
1045};
1046
1048public:
1049 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1050
1051 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1052
1053 bool hasDispatchPtr() const { return DispatchPtr; }
1054
1055 bool hasQueuePtr() const { return QueuePtr; }
1056
1057 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1058
1059 bool hasDispatchID() const { return DispatchID; }
1060
1061 bool hasFlatScratchInit() const { return FlatScratchInit; }
1062
1063 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1064
1065 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1066
1067 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1068
1069 unsigned getNumFreeUserSGPRs();
1070
1071 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1072
1083
1084 // Returns the size in number of SGPRs for preload user SGPR field.
1086 switch (ID) {
1088 return 2;
1090 return 4;
1091 case DispatchPtrID:
1092 return 2;
1093 case QueuePtrID:
1094 return 2;
1096 return 2;
1097 case DispatchIdID:
1098 return 2;
1099 case FlatScratchInitID:
1100 return 2;
1102 return 1;
1103 }
1104 llvm_unreachable("Unknown UserSGPRID.");
1105 }
1106
1107 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1108
1109private:
1110 const GCNSubtarget &ST;
1111
1112 // Private memory buffer
1113 // Compute directly in sgpr[0:1]
1114 // Other shaders indirect 64-bits at sgpr[0:1]
1115 bool ImplicitBufferPtr = false;
1116
1117 bool PrivateSegmentBuffer = false;
1118
1119 bool DispatchPtr = false;
1120
1121 bool QueuePtr = false;
1122
1123 bool KernargSegmentPtr = false;
1124
1125 bool DispatchID = false;
1126
1127 bool FlatScratchInit = false;
1128
1129 bool PrivateSegmentSize = false;
1130
1131 unsigned NumKernargPreloadSGPRs = 0;
1132
1133 unsigned NumUsedUserSGPRs = 0;
1134};
1135
1136} // end namespace llvm
1137
1138#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden)
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
Base class for AMDGPU specific classes of TargetSubtarget.
AMDHSA kernel descriptor definitions.
static bool hasFeature(StringRef Feature, const FeatureBitset &FeatureBits, ArrayRef< SubtargetFeatureKV > ProcFeatures)
#define F(x, y, z)
Definition MD5.cpp:54
Promote Memory to Register
Definition Mem2Reg.cpp:110
SI DAG Lowering interface definition.
Interface definition for SIInstrInfo.
static cl::opt< unsigned > CacheLineSize("cache-line-size", cl::init(0), cl::Hidden, cl::desc("Use this to override the target cache line size when " "specified by the user."))
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned getMaxWavesPerEU() const
unsigned getWavefrontSize() const
bool hasPrefetch() const
bool hasFlat() const
bool hasD16Images() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
bool partialVCCWritesUpdateVCCZ() const
Writes to VCC_LO/VCC_HI update the VCCZ flag.
bool hasSwap() const
bool hasPkMinMax3Insts() const
bool hasD16LoadStore() const
bool hasMergedShaders() const
bool hasRrWGMode() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
bool hasOnlyRevVALUShifts() const
bool hasNonNSAEncoding() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasExpertSchedulingMode() const
void mirFileLoaded(MachineFunction &MF) const override
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool loadStoreOptEnabled() const
bool enableSubRegLiveness() const override
unsigned getSGPRAllocGranule() const
bool hasLdsAtomicAddF64() const
bool hasFlatLgkmVMemCountInOrder() const
bool flatScratchIsPointer() const
bool requiresWaitOnWorkgroupReleaseFence() const
bool hasShift64HighRegBug() const
unsigned MaxPrivateElementSize
bool unsafeDSOffsetFoldingEnabled() const
bool hasFPAtomicToDenormModeHazard() const
unsigned getAddressableNumArchVGPRs() const
bool vmemWriteNeedsExpWaitcnt() const
bool shouldClusterStores() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasUserSGPRInit16BugInWave32() const
unsigned getSGPREncodingGranule() const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasFlatScratchHiInB64InstHazard() const
bool hasDstSelForwardingHazard() const
void setScalarizeGlobalBehavior(bool b)
bool hasFlatScratchEnabled() const
bool hasRelaxedBufferOOBMode() const
unsigned getSNopBits() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool hasMultiDwordFlatScratchAddressing() const
bool hasFmaakFmamkF64Insts() const
bool hasDsSwizzleRotateMode() const
bool hasHWFP64() const
bool hasScaleOffset() const
bool hasDenormModeInst() const
bool hasCvtScaleForwardingHazard() const
unsigned getTotalNumVGPRs() const
unsigned getMinWavesPerEU() const override
bool hasUnalignedDSAccessEnabled() const
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM, bool BufferOOBRelaxed=false, bool TBufferOOBRelaxed=false)
const SIInstrInfo * getInstrInfo() const override
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasCondSubInsts() const
const InlineAsmLowering * getInlineAsmLowering() const override
unsigned getTotalNumSGPRs() const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
Align getStackAlignment() const
bool privateMemoryResourceIsRangeChecked() const
bool hasScalarSubwordLoads() const
const bool BufferOOBRelaxed
bool hasMadF16() const
bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
bool hasLoopHeadInstSplitSensitivity() const
bool hasDwordx3LoadStores() const
bool hasSignedScratchOffsets() const
bool hasGlobalAddTidInsts() const
bool hasFlatScrRegister() const
bool hasGetPCZeroExtension() const
bool hasPermLane64() const
bool requiresNopBeforeDeallocVGPRs() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool supportsGetDoorbellID() const
bool supportsWave32() const
bool isTgSplitEnabled() const
unsigned getMaxNumAGPRs(const Function &F) const
bool hasReadM0MovRelInterpHazard() const
bool isDynamicVGPREnabled() const
bool hasInstPrefSize() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasDOTOpSelHazard() const
bool hasLdsWaitVMSRC() const
const TargetRegisterClass * getBoolRC() const
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool hasFmaakFmamkF32Insts() const
bool hasMad64_32() const
InstructionSelector * getInstructionSelector() const override
unsigned getVGPREncodingGranule() const
bool hasHardClauses() const
bool useDS128() const
bool hasExtendedWaitCounts() const
bool d16PreservesUnusedBits() const
bool hasInstPrefetch() const
bool hasAddPC64Inst() const
unsigned maxHardClauseLength() const
bool isMesaGfxShader(const Function &F) const
bool hasExportInsts() const
bool hasVINTERPEncoding() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
bool hasLegacyGeometry() const
TrapHandlerAbi getTrapHandlerAbi() const
bool isCuModeEnabled() const
const SIFrameLowering * getFrameLowering() const override
bool hasDPPRowShare() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
bool useVGPRBlockOpsForCSR() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool needsKernArgPreloadProlog() const
bool hasMin3Max3_16() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const
AMDGPU::TargetID TargetID
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
const SITargetLowering * getTargetLowering() const override
bool hasTransForwardingHazard() const
bool enableMachineScheduler() const override
bool hasLDSFPAtomicAddF64() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool getScalarizeGlobalBehavior() const
bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const
bool hasReadM0LdsDmaHazard() const
bool hasScalarSMulU64() const
const AMDGPU::TargetID & getTargetID() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasScratchBaseForwardingHazard() const
bool hasRelaxedTBufferOOBMode() const
bool hasScalarPackInsts() const
bool requiresDisjointEarlyClobberAndUndef() const override
bool hasVALUReadSGPRHazard() const
bool usePRTStrictNull() const
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const
bool supportsWaveWideBPermute() const
bool hasMed3_16() const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool hasUnalignedScratchAccessEnabled() const
bool hasNullExportTarget() const
Return true if the target's EXP instruction supports the NULL export target.
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
const bool TBufferOOBRelaxed
bool useAA() const override
bool isWave32() const
bool isGFX11Plus() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
bool hasUnalignedBufferAccessEnabled() const
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
unsigned getMinFlatWorkGroupSize() const override
bool hasAsyncMark() const
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool supportsMinMaxDenormModes() const
bool supportsWave64() const
bool supportsBPermute() const
bool hasFlatScratchSVSMode() const
unsigned InstCacheLineSize
bool supportsWGP() const
bool hasAtomicFaddInsts() const
bool hasSubClampInsts() const
bool requiresWaitXCntForSingleAccessInstructions() const
unsigned getNSAMaxSize(bool HasSampler=false) const
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
bool hasVOP3DPP() const
void getInstPrefSizeArgs(uint32_t &Mask, uint32_t &Shift, uint32_t &Width, uint32_t &CacheLineSize) const
unsigned getMaxFlatWorkGroupSize() const override
unsigned getMaxNumUserSGPRs() const
unsigned MaxHardClauseLength
The maximum number of instructions that may be placed within an S_CLAUSE, which is one greater than t...
bool hasFlatScratchSVSSwizzleBug() const
bool hasVDecCoExecHazard() const
bool hasSignedGVSOffset() const
bool hasLDSFPAtomicAddF32() const
unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDelayAlu() const
Return true if the target has the S_DELAY_ALU instruction.
bool hasReadM0SendMsgHazard() const
bool hasScalarMulHiInsts() const
bool hasSCmpK() const
bool hasVCvtPkIU16F32() const
Return true if the target has the V_CVT_PK_I16_F32/V_CVT_PK_U16_F32 instructions.
const LegalizerInfo * getLegalizerInfo() const override
bool requiresWaitIdleBeforeGetReg() const
bool hasDS96AndDS128() const
bool hasFmaLegacy32Insts() const
bool hasReadM0LdsDirectHazard() const
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI)
Generation getGeneration() const
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
bool hasScalarAddSub64() const
bool hasSplitBarriers() const
bool enableEarlyIfConversion() const override
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasSGetShaderCyclesInst() const
bool hasINVWBL2WaitCntRequirement() const
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasFlatScratchSTMode() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasGWSSemaReleaseAll() const
bool hasAddr64() const
unsigned getAddressableNumSGPRs() const
bool hasReadVCCZBug() const
Extra wait hazard is needed in some cases before s_cbranch_vccnz/s_cbranch_vccz.
bool isWave64() const
unsigned getDynamicVGPRBlockSize() const
bool setRegModeNeedsVNOPs() const
bool hasFractBug() const
bool isPreciseMemoryEnabled() const
unsigned getMaxWaveScratchSize() const
bool hasLDSMisalignedBugInWGPMode() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool hasVOPD3() const
bool hasAtomicCSub() const
bool requiresCodeObjectV6() const
const CallLowering * getCallLowering() const override
bool hasLdsDirect() const
bool hasGWSAutoReplay() const
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
unsigned getNumKernargPreloadSGPRs() const
unsigned getNumUsedUserSGPRs() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
Itinerary data supplied by a subtarget to be used by a target.
Scheduling dependency.
Definition ScheduleDAG.h:51
const TargetRegisterClass * getBoolRC() const
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs...
Definition StringRef.h:888
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Module flag names controlling out-of-bounds buffer access semantics.
constexpr StringLiteral BufferFlag("amdgpu.buffer.oob.mode")
constexpr StringLiteral TBufferFlag("amdgpu.tbuffer.oob.mode")
unsigned getSGPRAllocGranule(const MCSubtargetInfo &STI)
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo &STI)
unsigned getTotalNumSGPRs(const MCSubtargetInfo &STI)
unsigned getAddressableNumSGPRs(const MCSubtargetInfo &STI)
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo &STI)
unsigned getVGPREncodingGranule(const MCSubtargetInfo &STI, std::optional< bool > EnableWavefrontSize32)
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo &STI, unsigned FlatWorkGroupSize)
unsigned getMinNumSGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU)
unsigned getMaxNumSGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, bool Addressable)
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo &STI, unsigned FlatWorkGroupSize)
constexpr unsigned getMaxFlatWorkGroupSize()
unsigned getSGPREncodingGranule(const MCSubtargetInfo &STI)
unsigned getTotalNumVGPRs(const MCSubtargetInfo &STI)
unsigned getMinNumVGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize)
unsigned getMaxNumVGPRs(const MCSubtargetInfo &STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getVGPRAllocGranule(const MCSubtargetInfo &STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getMinWavesPerEU(const MCSubtargetInfo &STI)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
A region of an MBB for scheduling.