LLVM 23.0.0git
GCNSubtarget.h
Go to the documentation of this file.
1//=====-- GCNSubtarget.h - Define GCN Subtarget for AMDGPU ------*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//==-----------------------------------------------------------------------===//
8//
9/// \file
10/// AMD GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
15#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
16
17#include "AMDGPUCallLowering.h"
19#include "AMDGPUSubtarget.h"
20#include "SIFrameLowering.h"
21#include "SIISelLowering.h"
22#include "SIInstrInfo.h"
25
26#define GET_SUBTARGETINFO_HEADER
27#include "AMDGPUGenSubtargetInfo.inc"
28
29namespace llvm {
30
31class GCNTargetMachine;
32
34 public AMDGPUSubtarget {
35public:
37
38 // Following 2 enums are documented at:
39 // - https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
40 enum class TrapHandlerAbi {
41 NONE = 0x00,
42 AMDHSA = 0x01,
43 };
44
45 enum class TrapID {
48 };
49
50private:
51 /// SelectionDAGISel related APIs.
52 std::unique_ptr<const SelectionDAGTargetInfo> TSInfo;
53
54 /// GlobalISel related APIs.
55 std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
56 std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
57 std::unique_ptr<InstructionSelector> InstSelector;
58 std::unique_ptr<LegalizerInfo> Legalizer;
59 std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
60
61protected:
62 // Basic subtarget description.
64 unsigned Gen = INVALID;
66 int LDSBankCount = 0;
68
69 // Instruction cache line size in bytes; set from TableGen subtarget features.
70 unsigned InstCacheLineSize = 0;
71
72 // Dynamically set bits that enable features.
73 bool DynamicVGPR = false;
75 bool ScalarizeGlobal = false;
76
77 /// The maximum number of instructions that may be placed within an S_CLAUSE,
78 /// which is one greater than the maximum argument to S_CLAUSE. A value of 0
79 /// indicates a lack of S_CLAUSE support.
80 unsigned MaxHardClauseLength = 0;
81
82#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
83 bool ATTRIBUTE = DEFAULT;
84#include "AMDGPUGenSubtargetInfo.inc"
85
86private:
87 SIInstrInfo InstrInfo;
88 SITargetLowering TLInfo;
89 SIFrameLowering FrameLowering;
90
91public:
92 GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
93 const GCNTargetMachine &TM);
94 ~GCNSubtarget() override;
95
97 StringRef FS);
98
99 /// Diagnose inconsistent subtarget features before attempting to codegen
100 /// function \p F.
101 void checkSubtargetFeatures(const Function &F) const;
102
103 const SIInstrInfo *getInstrInfo() const override { return &InstrInfo; }
104
105 const SIFrameLowering *getFrameLowering() const override {
106 return &FrameLowering;
107 }
108
109 const SITargetLowering *getTargetLowering() const override { return &TLInfo; }
110
111 const SIRegisterInfo *getRegisterInfo() const override {
112 return &InstrInfo.getRegisterInfo();
113 }
114
115 const SelectionDAGTargetInfo *getSelectionDAGInfo() const override;
116
117 const CallLowering *getCallLowering() const override {
118 return CallLoweringInfo.get();
119 }
120
121 const InlineAsmLowering *getInlineAsmLowering() const override {
122 return InlineAsmLoweringInfo.get();
123 }
124
126 return InstSelector.get();
127 }
128
129 const LegalizerInfo *getLegalizerInfo() const override {
130 return Legalizer.get();
131 }
132
133 const AMDGPURegisterBankInfo *getRegBankInfo() const override {
134 return RegBankInfo.get();
135 }
136
138 return TargetID;
139 }
140
142 return &InstrItins;
143 }
144
146
148
149 bool isGFX11Plus() const { return getGeneration() >= GFX11; }
150
151#define GET_SUBTARGETINFO_MACRO(ATTRIBUTE, DEFAULT, GETTER) \
152 bool GETTER() const override { return ATTRIBUTE; }
153#include "AMDGPUGenSubtargetInfo.inc"
154
155 unsigned getMaxWaveScratchSize() const {
156 // See COMPUTE_TMPRING_SIZE.WAVESIZE.
157 if (getGeneration() >= GFX12) {
158 // 18-bit field in units of 64-dword.
159 return (64 * 4) * ((1 << 18) - 1);
160 }
161 if (getGeneration() == GFX11) {
162 // 15-bit field in units of 64-dword.
163 return (64 * 4) * ((1 << 15) - 1);
164 }
165 // 13-bit field in units of 256-dword.
166 return (256 * 4) * ((1 << 13) - 1);
167 }
168
169 /// Return the number of high bits known to be zero for a frame index.
173
174 int getLDSBankCount() const { return LDSBankCount; }
175
176 /// Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
177 unsigned getInstCacheLineSize() const { return InstCacheLineSize; }
178
179 unsigned getMaxPrivateElementSize(bool ForBufferRSrc = false) const {
180 return (ForBufferRSrc || !hasFlatScratchEnabled()) ? MaxPrivateElementSize
181 : 16;
182 }
183
184 unsigned getConstantBusLimit(unsigned Opcode) const;
185
186 /// Returns if the result of this instruction with a 16-bit result returned in
187 /// a 32-bit register implicitly zeroes the high 16-bits, rather than preserve
188 /// the original value.
189 bool zeroesHigh16BitsOfDest(unsigned Opcode) const;
190
191 bool supportsWGP() const {
192 if (HasGFX1250Insts)
193 return false;
194 return getGeneration() >= GFX10;
195 }
196
197 bool hasHWFP64() const { return HasFP64; }
198
199 bool hasAddr64() const {
201 }
202
203 bool hasFlat() const {
205 }
206
207 // Return true if the target only has the reverse operand versions of VALU
208 // shift instructions (e.g. v_lshrrev_b32, and no v_lshr_b32).
209 bool hasOnlyRevVALUShifts() const {
211 }
212
213 bool hasFractBug() const { return getGeneration() == SOUTHERN_ISLANDS; }
214
215 bool hasMed3_16() const { return getGeneration() >= AMDGPUSubtarget::GFX9; }
216
217 bool hasMin3Max3_16() const {
219 }
220
221 bool hasSwap() const { return HasGFX9Insts; }
222
223 bool hasScalarPackInsts() const { return HasGFX9Insts; }
224
225 bool hasScalarMulHiInsts() const { return HasGFX9Insts; }
226
227 bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
228
229 bool hasAsyncMark() const { return hasVMemToLDSLoad() || HasAsynccnt; }
230
234
236 // The S_GETREG DOORBELL_ID is supported by all GFX9 onward targets.
237 return getGeneration() >= GFX9;
238 }
239
240 /// True if the offset field of DS instructions works as expected. On SI, the
241 /// offset uses a 16-bit adder and does not always wrap properly.
242 bool hasUsableDSOffset() const { return getGeneration() >= SEA_ISLANDS; }
243
245 return EnableUnsafeDSOffsetFolding;
246 }
247
248 /// Condition output from div_scale is usable.
252
253 /// Extra wait hazard is needed in some cases before
254 /// s_cbranch_vccnz/s_cbranch_vccz.
255 bool hasReadVCCZBug() const { return getGeneration() <= SEA_ISLANDS; }
256
257 /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
258 bool partialVCCWritesUpdateVCCZ() const { return getGeneration() >= GFX10; }
259
260 /// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
261 /// was written by a VALU instruction.
264 }
265
266 /// A read of an SGPR by a VMEM instruction requires 5 wait states when the
267 /// SGPR was written by a VALU Instruction.
270 }
271
272 bool hasRFEHazards() const { return getGeneration() >= VOLCANIC_ISLANDS; }
273
274 /// Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
275 unsigned getSetRegWaitStates() const {
276 return getGeneration() <= SEA_ISLANDS ? 1 : 2;
277 }
278
279 /// Return the amount of LDS that can be used that will not restrict the
280 /// occupancy lower than WaveCount.
281 unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
282 const Function &) const;
283
286 }
287
288 /// \returns If target supports S_DENORM_MODE.
289 bool hasDenormModeInst() const {
291 }
292
293 /// \returns If target supports ds_read/write_b128 and user enables generation
294 /// of ds_read/write_b128.
295 bool useDS128() const { return HasCIInsts && EnableDS128; }
296
297 /// \return If target supports ds_read/write_b96/128.
298 bool hasDS96AndDS128() const { return HasCIInsts; }
299
300 /// Have v_trunc_f64, v_ceil_f64, v_rndne_f64
301 bool haveRoundOpsF64() const { return HasCIInsts; }
302
303 /// \returns If MUBUF instructions always perform range checking, even for
304 /// buffer resources used for private memory access.
308
309 /// \returns If target requires PRT Struct NULL support (zero result registers
310 /// for sparse texture support).
311 bool usePRTStrictNull() const { return EnablePRTStrictNull; }
312
314 return HasUnalignedBufferAccess && HasUnalignedAccessMode;
315 }
316
318 return HasUnalignedDSAccess && HasUnalignedAccessMode;
319 }
320
322 return HasUnalignedScratchAccess && HasUnalignedAccessMode;
323 }
324
325 bool isXNACKEnabled() const { return TargetID.isXnackOnOrAny(); }
326
327 bool isTgSplitEnabled() const { return EnableTgSplit; }
328
329 bool isCuModeEnabled() const { return EnableCuMode; }
330
331 bool isPreciseMemoryEnabled() const { return EnablePreciseMemory; }
332
333 bool hasFlatScrRegister() const { return hasFlatAddressSpace(); }
334
335 // Check if target supports ST addressing mode with FLAT scratch instructions.
336 // The ST addressing mode means no registers are used, either VGPR or SGPR,
337 // but only immediate offset is swizzled and added to the FLAT scratch base.
338 bool hasFlatScratchSTMode() const {
339 return hasFlatScratchInsts() && (hasGFX10_3Insts() || hasGFX940Insts());
340 }
341
342 bool hasFlatScratchSVSMode() const { return HasGFX940Insts || HasGFX11Insts; }
343
345 return hasArchitectedFlatScratch() ||
346 (EnableFlatScratch && hasFlatScratchInsts());
347 }
348
349 bool hasGlobalAddTidInsts() const { return HasGFX10_BEncoding; }
350
351 bool hasAtomicCSub() const { return HasGFX10_BEncoding; }
352
353 bool hasMTBUFInsts() const { return !hasGFX1250Insts(); }
354
355 bool hasFormattedMUBUFInsts() const { return !hasGFX1250Insts(); }
356
357 bool hasExportInsts() const {
358 return !hasGFX940Insts() && !hasGFX1250Insts();
359 }
360
361 bool hasVINTERPEncoding() const {
362 return HasGFX11Insts && !hasGFX1250Insts();
363 }
364
365 // DS_ADD_F64/DS_ADD_RTN_F64
366 bool hasLdsAtomicAddF64() const {
367 return hasGFX90AInsts() || hasGFX1250Insts();
368 }
369
371 return getGeneration() >= GFX9;
372 }
373
374 bool hasFlatLgkmVMemCountInOrder() const { return getGeneration() > GFX9; }
375
376 bool hasD16LoadStore() const { return getGeneration() >= GFX9; }
377
379 return hasD16LoadStore() && !TargetID.isSramEccOnOrAny();
380 }
381
382 bool hasD16Images() const { return getGeneration() >= VOLCANIC_ISLANDS; }
383
384 /// Return if most LDS instructions have an m0 use that require m0 to be
385 /// initialized.
386 bool ldsRequiresM0Init() const { return getGeneration() < GFX9; }
387
388 // True if the hardware rewinds and replays GWS operations if a wave is
389 // preempted.
390 //
391 // If this is false, a GWS operation requires testing if a nack set the
392 // MEM_VIOL bit, and repeating if so.
393 bool hasGWSAutoReplay() const { return getGeneration() >= GFX9; }
394
395 /// \returns if target has ds_gws_sema_release_all instruction.
396 bool hasGWSSemaReleaseAll() const { return HasCIInsts; }
397
398 bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
399
400 bool hasScalarSMulU64() const { return getGeneration() >= GFX12; }
401
402 // Covers VS/PS/CS graphics shaders
403 bool isMesaGfxShader(const Function &F) const {
404 return isMesa3DOS() && AMDGPU::isShader(F.getCallingConv());
405 }
406
407 bool hasMad64_32() const { return getGeneration() >= SEA_ISLANDS; }
408
409 bool hasAtomicFaddInsts() const {
410 return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
411 }
412
414 return getGeneration() < SEA_ISLANDS;
415 }
416
417 bool hasInstPrefetch() const {
418 return getGeneration() == GFX10 || getGeneration() == GFX11;
419 }
420
421 bool hasPrefetch() const { return HasGFX12Insts; }
422
423 // Has s_cmpk_* instructions.
424 bool hasSCmpK() const { return getGeneration() < GFX12; }
425
426 // Scratch is allocated in 256 dword per wave blocks for the entire
427 // wavefront. When viewed from the perspective of an arbitrary workitem, this
428 // is 4-byte aligned.
429 //
430 // Only 4-byte alignment is really needed to access anything. Transformations
431 // on the pointer value itself may rely on the alignment / known low bits of
432 // the pointer. Set this to something above the minimum to avoid needing
433 // dynamic realignment in common cases.
434 Align getStackAlignment() const { return Align(16); }
435
436 bool enableMachineScheduler() const override { return true; }
437
438 bool useAA() const override;
439
440 bool enableSubRegLiveness() const override { return true; }
441
444
445 // static wrappers
446 static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
447
448 // XXX - Why is this here if it isn't in the default pass set?
449 bool enableEarlyIfConversion() const override { return true; }
450
452 const SchedRegion &Region) const override;
453
455 const SchedRegion &Region) const override;
456
457 void mirFileLoaded(MachineFunction &MF) const override;
458
459 unsigned getMaxNumUserSGPRs() const {
460 return AMDGPU::getMaxNumUserSGPRs(*this);
461 }
462
463 bool useVGPRIndexMode() const;
464
465 bool hasScalarCompareEq64() const {
467 }
468
469 bool hasLDSFPAtomicAddF32() const { return HasGFX8Insts; }
470 bool hasLDSFPAtomicAddF64() const {
471 return HasGFX90AInsts || HasGFX1250Insts;
472 }
473
474 /// \returns true if the subtarget has the v_permlanex16_b32 instruction.
475 bool hasPermLaneX16() const { return getGeneration() >= GFX10; }
476
477 /// \returns true if the subtarget has the v_permlane64_b32 instruction.
478 bool hasPermLane64() const { return getGeneration() >= GFX11; }
479
480 bool hasDPPRowShare() const {
481 return HasDPP && (HasGFX90AInsts || getGeneration() >= GFX10);
482 }
483
484 // Has V_PK_MOV_B32 opcode
485 bool hasPkMovB32() const { return HasGFX90AInsts; }
486
488 return getGeneration() >= GFX10 || hasGFX940Insts();
489 }
490
491 bool hasFmaakFmamkF64Insts() const { return hasGFX1250Insts(); }
492
493 bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
494
495 unsigned getNSAMaxSize(bool HasSampler = false) const {
496 return AMDGPU::getNSAMaxSize(*this, HasSampler);
497 }
498
499 bool hasMadF16() const;
500
501 bool hasMovB64() const { return HasGFX940Insts || HasGFX1250Insts; }
502
503 // Scalar and global loads support scale_offset bit.
504 bool hasScaleOffset() const { return HasGFX1250Insts; }
505
506 // FLAT GLOBAL VOffset is signed
507 bool hasSignedGVSOffset() const { return HasGFX1250Insts; }
508
510
512 return HasUserSGPRInit16Bug && isWave32();
513 }
514
518
519 // \returns true if the subtarget supports DWORDX3 load/store instructions.
520 bool hasDwordx3LoadStores() const { return HasCIInsts; }
521
525
530
533 }
534
537 }
538
540 return HasLDSMisalignedBug && !EnableCuMode;
541 }
542
543 // Shift amount of a 64 bit shift cannot be a highest allocated register
544 // if also at the end of the allocation block.
545 bool hasShift64HighRegBug() const { return HasGFX90AInsts; }
546
547 // Has one cycle hazard on transcendental instruction feeding a
548 // non transcendental VALU.
549 bool hasTransForwardingHazard() const { return HasGFX940Insts; }
550
551 // Has one cycle hazard on a VALU instruction partially writing dst with
552 // a shift of result bits feeding another VALU instruction.
553 bool hasDstSelForwardingHazard() const { return HasGFX940Insts; }
554
555 // Cannot use op_sel with v_dot instructions.
556 bool hasDOTOpSelHazard() const { return HasGFX940Insts || HasGFX11Insts; }
557
558 // Does not have HW interlocs for VALU writing and then reading SGPRs.
559 bool hasVDecCoExecHazard() const { return HasGFX940Insts; }
560
561 bool hasHardClauses() const { return MaxHardClauseLength > 0; }
562
564 return getGeneration() == GFX10;
565 }
566
567 bool hasVOP3DPP() const { return getGeneration() >= GFX11; }
568
569 bool hasLdsDirect() const { return getGeneration() >= GFX11; }
570
571 bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
572
574 return getGeneration() == GFX11;
575 }
576
577 bool hasCvtScaleForwardingHazard() const { return HasGFX950Insts; }
578
579 // All GFX9 targets experience a fetch delay when an instruction at the start
580 // of a loop header is split by a 32-byte fetch window boundary, but GFX950
581 // is uniquely sensitive to this: the delay triggers further performance
582 // degradation beyond the fetch latency itself.
583 bool hasLoopHeadInstSplitSensitivity() const { return HasGFX950Insts; }
584
585 bool requiresCodeObjectV6() const { return RequiresCOV6; }
586
587 bool useVGPRBlockOpsForCSR() const { return UseBlockVGPROpsForCSR; }
588
589 bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
590
592 return HasGFX12Insts && !HasGFX1250Insts;
593 }
594
595 bool setRegModeNeedsVNOPs() const {
596 return HasGFX1250Insts && getGeneration() == GFX12;
597 }
598
599 /// Return if operations acting on VGPR tuples require even alignment.
600 bool needsAlignedVGPRs() const { return RequiresAlignVGPR; }
601
602 /// Return true if the target has the S_PACK_HL_B32_B16 instruction.
603 bool hasSPackHL() const { return HasGFX11Insts; }
604
605 /// Return true if the target's EXP instruction has the COMPR flag, which
606 /// affects the meaning of the EN (enable) bits.
607 bool hasCompressedExport() const { return !HasGFX11Insts; }
608
609 /// Return true if the target's EXP instruction supports the NULL export
610 /// target.
611 bool hasNullExportTarget() const { return !HasGFX11Insts; }
612
613 bool hasFlatScratchSVSSwizzleBug() const { return getGeneration() == GFX11; }
614
615 /// Return true if the target has the S_DELAY_ALU instruction.
616 bool hasDelayAlu() const { return HasGFX11Insts; }
617
618 /// Returns true if the target supports
619 /// global_load_lds_dwordx3/global_load_lds_dwordx4 or
620 /// buffer_load_dwordx3/buffer_load_dwordx4 with the lds bit.
621 bool hasLDSLoadB96_B128() const { return hasGFX950Insts(); }
622
623 /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
624 /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
625 bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
626
627 /// \returns true if the target has packed f32 instructions that only read 32
628 /// bits from a scalar operand (SGPR or literal) and replicates the bits to
629 /// both channels.
631 return getGeneration() == GFX12 && HasGFX1250Insts;
632 }
633
634 bool hasAddPC64Inst() const { return HasGFX1250Insts; }
635
636 /// \returns true if the target supports expert scheduling mode 2 which relies
637 /// on the compiler to insert waits to avoid hazards between VMEM and VALU
638 /// instructions in some instances.
639 bool hasExpertSchedulingMode() const { return getGeneration() >= GFX12; }
640
641 /// \returns The maximum number of instructions that can be enclosed in an
642 /// S_CLAUSE on the given subtarget, or 0 for targets that do not support that
643 /// instruction.
644 unsigned maxHardClauseLength() const { return MaxHardClauseLength; }
645
646 /// Return the maximum number of waves per SIMD for kernels using \p SGPRs
647 /// SGPRs
648 unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
649
650 /// Return the maximum number of waves per SIMD for kernels using \p VGPRs
651 /// VGPRs
652 unsigned getOccupancyWithNumVGPRs(unsigned VGPRs,
653 unsigned DynamicVGPRBlockSize) const;
654
655 /// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
656 /// be achieved when the only function running on a CU is \p F, each workgroup
657 /// uses \p LDSSize bytes of LDS, and each wave uses \p NumSGPRs SGPRs and \p
658 /// NumVGPRs VGPRs. The flat workgroup sizes associated to the function are a
659 /// range, so this returns a range as well.
660 ///
661 /// Note that occupancy can be affected by the scratch allocation as well, but
662 /// we do not have enough information to compute it.
663 std::pair<unsigned, unsigned> computeOccupancy(const Function &F,
664 unsigned LDSSize = 0,
665 unsigned NumSGPRs = 0,
666 unsigned NumVGPRs = 0) const;
667
668 /// \returns true if the flat_scratch register should be initialized with the
669 /// pointer to the wave's scratch memory rather than a size and offset.
670 bool flatScratchIsPointer() const {
672 }
673
674 /// \returns true if the machine has merged shaders in which s0-s7 are
675 /// reserved by the hardware and user SGPRs start at s8
676 bool hasMergedShaders() const { return getGeneration() >= GFX9; }
677
678 // \returns true if the target supports the pre-NGG legacy geometry path.
679 bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
680
681 // \returns true if the target has split barriers feature
682 bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
683
684 // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
685 bool hasRrWGMode() const { return getGeneration() >= GFX12; }
686
687 /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
688 /// values.
689 bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
690
691 bool hasINVWBL2WaitCntRequirement() const { return HasGFX1250Insts; }
692
693 bool hasVOPD3() const { return HasGFX1250Insts; }
694
695 // \returns true if the target has V_MUL_U64/V_MUL_I64 instructions.
696 bool hasVectorMulU64() const { return HasGFX1250Insts; }
697
698 // \returns true if the target has V_MAD_NC_U64_U32/V_MAD_NC_I64_I32
699 // instructions.
700 bool hasMadU64U32NoCarry() const { return HasGFX1250Insts; }
701
702 // \returns true if the target has V_{MIN|MAX}_{I|U}64 instructions.
703 bool hasIntMinMax64() const { return HasGFX1250Insts; }
704
705 // \returns true if the target has V_PK_{MIN|MAX}3_{I|U}16 instructions.
706 bool hasPkMinMax3Insts() const { return HasGFX1250Insts; }
707
708 // \returns ture if target has S_GET_SHADER_CYCLES_U64 instruction.
709 bool hasSGetShaderCyclesInst() const { return HasGFX1250Insts; }
710
711 // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
712 // of sign-extending. Note that GFX1250 has not only fixed the bug but also
713 // extended VA to 57 bits.
715 return HasGFX12Insts && !HasGFX1250Insts;
716 }
717
718 // \returns true if the target needs to create a prolog for backward
719 // compatibility when preloading kernel arguments.
721 return hasKernargPreload() && !HasGFX1250Insts;
722 }
723
724 bool hasCondSubInsts() const { return HasGFX12Insts; }
725
726 bool hasSubClampInsts() const { return hasGFX10_3Insts(); }
727
728 /// \returns SGPR allocation granularity supported by the subtarget.
729 unsigned getSGPRAllocGranule() const {
731 }
732
733 /// \returns SGPR encoding granularity supported by the subtarget.
734 unsigned getSGPREncodingGranule() const {
736 }
737
738 /// \returns Total number of SGPRs supported by the subtarget.
739 unsigned getTotalNumSGPRs() const {
741 }
742
743 /// \returns Addressable number of SGPRs supported by the subtarget.
744 unsigned getAddressableNumSGPRs() const {
746 }
747
748 /// \returns Minimum number of SGPRs that meets the given number of waves per
749 /// execution unit requirement supported by the subtarget.
750 unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
751 return AMDGPU::IsaInfo::getMinNumSGPRs(this, WavesPerEU);
752 }
753
754 /// \returns Maximum number of SGPRs that meets the given number of waves per
755 /// execution unit requirement supported by the subtarget.
756 unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
757 return AMDGPU::IsaInfo::getMaxNumSGPRs(this, WavesPerEU, Addressable);
758 }
759
760 /// \returns Reserved number of SGPRs. This is common
761 /// utility function called by MachineFunction and
762 /// Function variants of getReservedNumSGPRs.
763 unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const;
764 /// \returns Reserved number of SGPRs for given machine function \p MF.
765 unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
766
767 /// \returns Reserved number of SGPRs for given function \p F.
768 unsigned getReservedNumSGPRs(const Function &F) const;
769
770 /// \returns Maximum number of preloaded SGPRs for the subtarget.
771 unsigned getMaxNumPreloadedSGPRs() const;
772
773 /// \returns max num SGPRs. This is the common utility
774 /// function called by MachineFunction and Function
775 /// variants of getMaxNumSGPRs.
776 unsigned getBaseMaxNumSGPRs(const Function &F,
777 std::pair<unsigned, unsigned> WavesPerEU,
778 unsigned PreloadedSGPRs,
779 unsigned ReservedNumSGPRs) const;
780
781 /// \returns Maximum number of SGPRs that meets number of waves per execution
782 /// unit requirement for function \p MF, or number of SGPRs explicitly
783 /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
784 ///
785 /// \returns Value that meets number of waves per execution unit requirement
786 /// if explicitly requested value cannot be converted to integer, violates
787 /// subtarget's specifications, or does not meet number of waves per execution
788 /// unit requirement.
789 unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
790
791 /// \returns Maximum number of SGPRs that meets number of waves per execution
792 /// unit requirement for function \p F, or number of SGPRs explicitly
793 /// requested using "amdgpu-num-sgpr" attribute attached to function \p F.
794 ///
795 /// \returns Value that meets number of waves per execution unit requirement
796 /// if explicitly requested value cannot be converted to integer, violates
797 /// subtarget's specifications, or does not meet number of waves per execution
798 /// unit requirement.
799 unsigned getMaxNumSGPRs(const Function &F) const;
800
801 /// \returns VGPR allocation granularity supported by the subtarget.
802 unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const {
803 return AMDGPU::IsaInfo::getVGPRAllocGranule(this, DynamicVGPRBlockSize);
804 }
805
806 /// \returns VGPR encoding granularity supported by the subtarget.
807 unsigned getVGPREncodingGranule() const {
809 }
810
811 /// \returns Total number of VGPRs supported by the subtarget.
812 unsigned getTotalNumVGPRs() const {
814 }
815
816 /// \returns Addressable number of architectural VGPRs supported by the
817 /// subtarget.
821
822 /// \returns Addressable number of VGPRs supported by the subtarget.
823 unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const {
824 return AMDGPU::IsaInfo::getAddressableNumVGPRs(this, DynamicVGPRBlockSize);
825 }
826
827 /// \returns the minimum number of VGPRs that will prevent achieving more than
828 /// the specified number of waves \p WavesPerEU.
829 unsigned getMinNumVGPRs(unsigned WavesPerEU,
830 unsigned DynamicVGPRBlockSize) const {
831 return AMDGPU::IsaInfo::getMinNumVGPRs(this, WavesPerEU,
832 DynamicVGPRBlockSize);
833 }
834
835 /// \returns the maximum number of VGPRs that can be used and still achieved
836 /// at least the specified number of waves \p WavesPerEU.
837 unsigned getMaxNumVGPRs(unsigned WavesPerEU,
838 unsigned DynamicVGPRBlockSize) const {
839 return AMDGPU::IsaInfo::getMaxNumVGPRs(this, WavesPerEU,
840 DynamicVGPRBlockSize);
841 }
842
843 /// \returns max num VGPRs. This is the common utility function
844 /// called by MachineFunction and Function variants of getMaxNumVGPRs.
845 unsigned
847 std::pair<unsigned, unsigned> NumVGPRBounds) const;
848
849 /// \returns Maximum number of VGPRs that meets number of waves per execution
850 /// unit requirement for function \p F, or number of VGPRs explicitly
851 /// requested using "amdgpu-num-vgpr" attribute attached to function \p F.
852 ///
853 /// \returns Value that meets number of waves per execution unit requirement
854 /// if explicitly requested value cannot be converted to integer, violates
855 /// subtarget's specifications, or does not meet number of waves per execution
856 /// unit requirement.
857 unsigned getMaxNumVGPRs(const Function &F) const;
858
859 unsigned getMaxNumAGPRs(const Function &F) const { return getMaxNumVGPRs(F); }
860
861 /// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
862 /// of waves per execution unit required for the function \p MF.
863 std::pair<unsigned, unsigned> getMaxNumVectorRegs(const Function &F) const;
864
865 /// \returns Maximum number of VGPRs that meets number of waves per execution
866 /// unit requirement for function \p MF, or number of VGPRs explicitly
867 /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
868 ///
869 /// \returns Value that meets number of waves per execution unit requirement
870 /// if explicitly requested value cannot be converted to integer, violates
871 /// subtarget's specifications, or does not meet number of waves per execution
872 /// unit requirement.
873 unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
874
875 bool supportsWave32() const { return getGeneration() >= GFX10; }
876
877 bool supportsWave64() const { return !hasGFX1250Insts(); }
878
879 bool isWave32() const { return getWavefrontSize() == 32; }
880
881 bool isWave64() const { return getWavefrontSize() == 64; }
882
883 /// Returns if the wavesize of this subtarget is known reliable. This is false
884 /// only for the a default target-cpu that does not have an explicit
885 /// +wavefrontsize target feature.
886 bool isWaveSizeKnown() const {
887 return hasFeature(AMDGPU::FeatureWavefrontSize32) ||
888 hasFeature(AMDGPU::FeatureWavefrontSize64);
889 }
890
892 return getRegisterInfo()->getBoolRC();
893 }
894
895 /// \returns Maximum number of work groups per compute unit supported by the
896 /// subtarget and limited by given \p FlatWorkGroupSize.
897 unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override {
898 return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(this, FlatWorkGroupSize);
899 }
900
901 /// \returns Minimum flat work group size supported by the subtarget.
902 unsigned getMinFlatWorkGroupSize() const override {
904 }
905
906 /// \returns Maximum flat work group size supported by the subtarget.
907 unsigned getMaxFlatWorkGroupSize() const override {
909 }
910
911 /// \returns Number of waves per execution unit required to support the given
912 /// \p FlatWorkGroupSize.
913 unsigned
914 getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
915 return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
916 }
917
918 /// \returns Minimum number of waves per execution unit supported by the
919 /// subtarget.
920 unsigned getMinWavesPerEU() const override {
922 }
923
924 void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
925 SDep &Dep,
926 const TargetSchedModel *SchedModel) const override;
927
928 // \returns true if it's beneficial on this subtarget for the scheduler to
929 // cluster stores as well as loads.
930 bool shouldClusterStores() const { return getGeneration() >= GFX11; }
931
932 // \returns the number of address arguments from which to enable MIMG NSA
933 // on supported architectures.
934 unsigned getNSAThreshold(const MachineFunction &MF) const;
935
936 // \returns true if the subtarget has a hazard requiring an "s_nop 0"
937 // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
938 bool requiresNopBeforeDeallocVGPRs() const { return !HasGFX1250Insts; }
939
940 // \returns true if the subtarget needs S_WAIT_ALU 0 before S_GETREG_B32 on
941 // STATUS, STATE_PRIV, EXCP_FLAG_PRIV, or EXCP_FLAG_USER.
942 bool requiresWaitIdleBeforeGetReg() const { return HasGFX1250Insts; }
943
944 bool isDynamicVGPREnabled() const { return DynamicVGPR; }
945 unsigned getDynamicVGPRBlockSize() const {
946 return DynamicVGPRBlockSize32 ? 32 : 16;
947 }
948
950 // AMDGPU doesn't care if early-clobber and undef operands are allocated
951 // to the same register.
952 return false;
953 }
954
955 // DS_ATOMIC_ASYNC_BARRIER_ARRIVE_B64 shall not be claused with anything
 956 // and surrounded by S_WAIT_ALU(0xFFE3).
958 return getGeneration() == GFX12;
959 }
960
961 // Requires s_wait_alu(0) after s102/s103 write and src_flat_scratch_base
962 // read.
964 return HasGFX1250Insts && getGeneration() == GFX12;
965 }
966
967 // src_flat_scratch_hi cannot be used as a source in SALU producing a 64-bit
968 // result.
970 return HasGFX1250Insts && getGeneration() == GFX12;
971 }
972
973 /// \returns true if the subtarget requires a wait for xcnt before VMEM
974 /// accesses that must never be repeated in the event of a page fault/re-try.
975 /// Atomic stores/rmw and all volatile accesses fall under this criteria.
977 return HasGFX1250Insts;
978 }
979
980 /// \returns the number of significant bits in the immediate field of the
981 /// S_NOP instruction.
982 unsigned getSNopBits() const {
984 return 7;
986 return 4;
987 return 3;
988 }
989
993
999
1000 /// Return true if real (non-fake) variants of True16 instructions using
1001 /// 16-bit registers should be code-generated. Fake True16 instructions are
1002 /// identical to non-fake ones except that they take 32-bit registers as
1003 /// operands and always use their low halves.
1004 // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
1005 // supported and the support for fake True16 instructions is removed.
1006 bool useRealTrue16Insts() const {
1007 return hasTrue16BitInsts() && EnableRealTrue16Insts;
1008 }
1009
1011 return getGeneration() >= GFX10 || isTgSplitEnabled();
1012 }
1013};
1014
1016public:
1017 bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
1018
1019 bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
1020
1021 bool hasDispatchPtr() const { return DispatchPtr; }
1022
1023 bool hasQueuePtr() const { return QueuePtr; }
1024
1025 bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
1026
1027 bool hasDispatchID() const { return DispatchID; }
1028
1029 bool hasFlatScratchInit() const { return FlatScratchInit; }
1030
1031 bool hasPrivateSegmentSize() const { return PrivateSegmentSize; }
1032
1033 unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
1034
1035 unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
1036
1037 unsigned getNumFreeUserSGPRs();
1038
1039 void allocKernargPreloadSGPRs(unsigned NumSGPRs);
1040
1051
1052 // Returns the size in number of SGPRs for preload user SGPR field.
1054 switch (ID) {
1056 return 2;
1058 return 4;
1059 case DispatchPtrID:
1060 return 2;
1061 case QueuePtrID:
1062 return 2;
1064 return 2;
1065 case DispatchIdID:
1066 return 2;
1067 case FlatScratchInitID:
1068 return 2;
1070 return 1;
1071 }
1072 llvm_unreachable("Unknown UserSGPRID.");
1073 }
1074
1075 GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
1076
1077private:
1078 const GCNSubtarget &ST;
1079
1080 // Private memory buffer
1081 // Compute directly in sgpr[0:1]
1082 // Other shaders indirect 64-bits at sgpr[0:1]
1083 bool ImplicitBufferPtr = false;
1084
1085 bool PrivateSegmentBuffer = false;
1086
1087 bool DispatchPtr = false;
1088
1089 bool QueuePtr = false;
1090
1091 bool KernargSegmentPtr = false;
1092
1093 bool DispatchID = false;
1094
1095 bool FlatScratchInit = false;
1096
1097 bool PrivateSegmentSize = false;
1098
1099 unsigned NumKernargPreloadSGPRs = 0;
1100
1101 unsigned NumUsedUserSGPRs = 0;
1102};
1103
1104} // end namespace llvm
1105
1106#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
static cl::opt< bool > EnableLoadStoreOpt("aarch64-enable-ldst-opt", cl::desc("Enable the load/store pair" " optimization pass"), cl::init(true), cl::Hidden)
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
Base class for AMDGPU specific classes of TargetSubtarget.
#define F(x, y, z)
Definition MD5.cpp:54
SI DAG Lowering interface definition.
Interface definition for SIInstrInfo.
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned getMaxWavesPerEU() const
unsigned getWavefrontSize() const
bool hasPrefetch() const
bool hasFlat() const
bool hasD16Images() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
bool partialVCCWritesUpdateVCCZ() const
Writes to VCC_LO/VCC_HI update the VCCZ flag.
bool hasSwap() const
bool hasPkMinMax3Insts() const
bool hasD16LoadStore() const
bool hasMergedShaders() const
bool hasRrWGMode() const
bool hasScalarCompareEq64() const
int getLDSBankCount() const
bool hasOnlyRevVALUShifts() const
bool hasNonNSAEncoding() const
bool hasUsableDivScaleConditionOutput() const
Condition output from div_scale is usable.
bool hasExpertSchedulingMode() const
void mirFileLoaded(MachineFunction &MF) const override
bool hasUsableDSOffset() const
True if the offset field of DS instructions works as expected.
bool loadStoreOptEnabled() const
bool enableSubRegLiveness() const override
unsigned getSGPRAllocGranule() const
bool hasLdsAtomicAddF64() const
bool hasFlatLgkmVMemCountInOrder() const
bool flatScratchIsPointer() const
bool requiresWaitOnWorkgroupReleaseFence() const
bool hasShift64HighRegBug() const
unsigned MaxPrivateElementSize
bool unsafeDSOffsetFoldingEnabled() const
bool hasFPAtomicToDenormModeHazard() const
unsigned getAddressableNumArchVGPRs() const
bool vmemWriteNeedsExpWaitcnt() const
bool shouldClusterStores() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
bool hasUserSGPRInit16BugInWave32() const
unsigned getSGPREncodingGranule() const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
bool hasCompressedExport() const
Return true if the target's EXP instruction has the COMPR flag, which affects the meaning of the EN (...
bool hasFlatScratchHiInB64InstHazard() const
bool hasDstSelForwardingHazard() const
void setScalarizeGlobalBehavior(bool b)
bool hasFlatScratchEnabled() const
unsigned getSNopBits() const
bool hasLDSLoadB96_B128() const
Returns true if the target supports global_load_lds_dwordx3/global_load_lds_dwordx4 or buffer_load_dw...
bool hasMultiDwordFlatScratchAddressing() const
bool hasFmaakFmamkF64Insts() const
bool hasHWFP64() const
bool hasScaleOffset() const
bool hasDenormModeInst() const
bool hasCvtScaleForwardingHazard() const
unsigned getTotalNumVGPRs() const
unsigned getMinWavesPerEU() const override
bool hasUnalignedDSAccessEnabled() const
const SIInstrInfo * getInstrInfo() const override
unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const override
unsigned getConstantBusLimit(unsigned Opcode) const
bool hasVALUMaskWriteHazard() const
bool hasCondSubInsts() const
const InlineAsmLowering * getInlineAsmLowering() const override
unsigned getTotalNumSGPRs() const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount, const Function &) const
Return the amount of LDS that can be used that will not restrict the occupancy lower than WaveCount.
bool hasPkMovB32() const
bool needsAlignedVGPRs() const
Return if operations acting on VGPR tuples require even alignment.
Align getStackAlignment() const
bool privateMemoryResourceIsRangeChecked() const
bool hasScalarSubwordLoads() const
bool hasMadF16() const
bool hasDsAtomicAsyncBarrierArriveB64PipeBug() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
bool hasLoopHeadInstSplitSensitivity() const
bool hasDwordx3LoadStores() const
bool hasSignedScratchOffsets() const
bool hasGlobalAddTidInsts() const
bool hasFlatScrRegister() const
bool hasGetPCZeroExtension() const
bool hasPermLane64() const
bool requiresNopBeforeDeallocVGPRs() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool supportsGetDoorbellID() const
bool supportsWave32() const
bool isTgSplitEnabled() const
unsigned getMaxNumAGPRs(const Function &F) const
bool hasReadM0MovRelInterpHazard() const
bool isDynamicVGPREnabled() const
const SIRegisterInfo * getRegisterInfo() const override
bool hasDOTOpSelHazard() const
bool hasLdsWaitVMSRC() const
const TargetRegisterClass * getBoolRC() const
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool hasFmaakFmamkF32Insts() const
bool hasMad64_32() const
InstructionSelector * getInstructionSelector() const override
unsigned getVGPREncodingGranule() const
bool hasHardClauses() const
bool useDS128() const
bool hasExtendedWaitCounts() const
bool d16PreservesUnusedBits() const
bool hasInstPrefetch() const
bool hasAddPC64Inst() const
unsigned maxHardClauseLength() const
bool isMesaGfxShader(const Function &F) const
bool hasExportInsts() const
bool hasVINTERPEncoding() const
const AMDGPURegisterBankInfo * getRegBankInfo() const override
bool hasLegacyGeometry() const
TrapHandlerAbi getTrapHandlerAbi() const
bool isCuModeEnabled() const
const SIFrameLowering * getFrameLowering() const override
bool hasDPPRowShare() const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns if the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
const AMDGPU::IsaInfo::AMDGPUTargetID & getTargetID() const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
bool has12DWordStoreHazard() const
bool hasVALUPartialForwardingHazard() const
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
bool useVGPRBlockOpsForCSR() const
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
bool needsKernArgPreloadProlog() const
bool hasMin3Max3_16() const
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
unsigned getVGPRAllocGranule(unsigned DynamicVGPRBlockSize) const
bool hasMadU64U32NoCarry() const
unsigned getSetRegWaitStates() const
Number of hazard wait states for s_setreg_b32/s_setreg_imm32_b32.
const SITargetLowering * getTargetLowering() const override
bool hasTransForwardingHazard() const
bool enableMachineScheduler() const override
bool hasLDSFPAtomicAddF64() const
unsigned getNSAThreshold(const MachineFunction &MF) const
bool getScalarizeGlobalBehavior() const
bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const
bool hasReadM0LdsDmaHazard() const
bool hasScalarSMulU64() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool hasScratchBaseForwardingHazard() const
bool hasIntMinMax64() const
bool hasScalarPackInsts() const
bool requiresDisjointEarlyClobberAndUndef() const override
bool hasVALUReadSGPRHazard() const
bool usePRTStrictNull() const
bool hasMovB64() const
unsigned getAddressableNumVGPRs(unsigned DynamicVGPRBlockSize) const
bool supportsWaveWideBPermute() const
bool hasMed3_16() const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
bool hasUnalignedScratchAccessEnabled() const
bool hasNullExportTarget() const
Return true if the target's EXP instruction supports the NULL export target.
bool ldsRequiresM0Init() const
Return if most LDS instructions have an m0 use that require m0 to be initialized.
bool useRealTrue16Insts() const
Return true if real (non-fake) variants of True16 instructions using 16-bit registers should be code-...
bool useAA() const override
bool isWave32() const
bool isGFX11Plus() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
bool hasUnalignedBufferAccessEnabled() const
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
unsigned getMinFlatWorkGroupSize() const override
bool hasAsyncMark() const
bool hasSPackHL() const
Return true if the target has the S_PACK_HL_B32_B16 instruction.
bool supportsMinMaxDenormModes() const
bool supportsWave64() const
bool supportsBPermute() const
bool hasFormattedMUBUFInsts() const
bool hasFlatScratchSVSMode() const
unsigned InstCacheLineSize
bool supportsWGP() const
bool hasAtomicFaddInsts() const
bool hasSubClampInsts() const
bool requiresWaitXCntForSingleAccessInstructions() const
unsigned getNSAMaxSize(bool HasSampler=false) const
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
bool hasVOP3DPP() const
unsigned getMaxFlatWorkGroupSize() const override
unsigned getMaxNumUserSGPRs() const
unsigned MaxHardClauseLength
The maximum number of instructions that may be placed within an S_CLAUSE, which is one greater than t...
bool hasPermLaneX16() const
bool hasFlatScratchSVSSwizzleBug() const
bool hasVDecCoExecHazard() const
bool hasSignedGVSOffset() const
bool hasLDSFPAtomicAddF32() const
unsigned getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override
bool haveRoundOpsF64() const
Have v_trunc_f64, v_ceil_f64, v_rndne_f64.
bool hasDelayAlu() const
Return true if the target has the S_DELAY_ALU instruction.
bool hasReadM0SendMsgHazard() const
bool hasVectorMulU64() const
bool hasScalarMulHiInsts() const
bool hasSCmpK() const
const LegalizerInfo * getLegalizerInfo() const override
bool requiresWaitIdleBeforeGetReg() const
bool hasDS96AndDS128() const
bool hasReadM0LdsDirectHazard() const
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI)
Generation getGeneration() const
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM)
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
bool hasScalarAddSub64() const
bool hasSplitBarriers() const
bool enableEarlyIfConversion() const override
bool hasSMRDReadVALUDefHazard() const
A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR was written by a VALU inst...
bool hasSGetShaderCyclesInst() const
bool hasINVWBL2WaitCntRequirement() const
bool hasRFEHazards() const
bool hasVMEMReadSGPRVALUDefHazard() const
A read of an SGPR by a VMEM instruction requires 5 wait states when the SGPR was written by a VALU In...
bool hasFlatScratchSTMode() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasGWSSemaReleaseAll() const
bool hasAddr64() const
unsigned getAddressableNumSGPRs() const
bool hasReadVCCZBug() const
Extra wait hazard is needed in some cases before s_cbranch_vccnz/s_cbranch_vccz.
bool isWave64() const
unsigned getDynamicVGPRBlockSize() const
bool setRegModeNeedsVNOPs() const
bool hasFractBug() const
bool isPreciseMemoryEnabled() const
unsigned getMaxWaveScratchSize() const
bool hasLDSMisalignedBugInWGPMode() const
bool hasMTBUFInsts() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
bool hasVOPD3() const
bool hasAtomicCSub() const
AMDGPU::IsaInfo::AMDGPUTargetID TargetID
bool requiresCodeObjectV6() const
const CallLowering * getCallLowering() const override
bool hasLdsDirect() const
bool hasGWSAutoReplay() const
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
unsigned getNumKernargPreloadSGPRs() const
unsigned getNumUsedUserSGPRs() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
Itinerary data supplied by a subtarget to be used by a target.
Scheduling dependency.
Definition ScheduleDAG.h:51
const TargetRegisterClass * getBoolRC() const
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Provide an instruction scheduling machine model to CodeGen passes.
TargetSubtargetInfo - Generic base class for all target subtargets.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI, std::optional< bool > EnableWavefrontSize32)
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI)
unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI, unsigned FlatWorkGroupSize)
unsigned getSGPREncodingGranule(const MCSubtargetInfo *STI)
constexpr unsigned getMaxFlatWorkGroupSize()
unsigned getAddressableNumSGPRs(const MCSubtargetInfo *STI)
unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU)
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI)
unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize, std::optional< bool > EnableWavefrontSize32)
unsigned getMaxNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, bool Addressable)
unsigned getMinWavesPerEU(const MCSubtargetInfo *STI)
unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI)
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU, unsigned DynamicVGPRBlockSize)
unsigned getAddressableNumArchVGPRs(const MCSubtargetInfo *STI)
unsigned getTotalNumSGPRs(const MCSubtargetInfo *STI)
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI, unsigned DynamicVGPRBlockSize)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
This is an optimization pass for GlobalISel generic memory operations.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:261
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
A region of an MBB for scheduling.