LLVM  6.0.0svn
Classes | Macros | Typedefs | Enumerations
AMDKernelCodeT.h File Reference
#include "llvm/MC/SubtargetFeature.h"
#include <cstddef>
#include <cstdint>
#include "llvm/Support/Debug.h"
Include dependency graph for AMDKernelCodeT.h:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

struct  hsa_dim3_s
 
struct  hsa_ext_control_directives_s
 The hsa_ext_control_directives_t specifies the values for the HSAIL control directives. More...
 
struct  amd_kernel_code_s
 AMD Kernel Code Object (amd_kernel_code_t). More...
 

Macros

#define AMD_HSA_BITS_SET(dst, mask, val)
 
#define AMD_HSA_BITS_GET(src, mask)   ((src & mask) >> mask ## _SHIFT) \
 

Typedefs

typedef uint8_t hsa_powertwo8_t
 
typedef uint32_t hsa_ext_code_kind_t
 
typedef uint8_t hsa_ext_brig_profile8_t
 
typedef uint8_t hsa_ext_brig_machine_model8_t
 
typedef uint64_t hsa_ext_control_directive_present64_t
 
typedef uint16_t hsa_ext_exception_kind16_t
 
typedef uint32_t hsa_ext_code_kind32_t
 
typedef struct hsa_dim3_s hsa_dim3_t
 
typedef uint32_t amd_code_version32_t
 The version of the amd_*_code_t struct. More...
 
typedef uint64_t amd_compute_pgm_resource_register64_t
 Shader program settings for CS. More...
 
typedef uint32_t amd_code_property32_t
 Every amd_*_code_t has the following properties, which are composed of a number of bit fields. More...
 
typedef struct hsa_ext_control_directives_s hsa_ext_control_directives_t
 The hsa_ext_control_directives_t specifies the values for the HSAIL control directives. More...
 
typedef struct amd_kernel_code_s amd_kernel_code_t
 AMD Kernel Code Object (amd_kernel_code_t). More...
 

Enumerations

enum  amd_code_version_t { AMD_CODE_VERSION_MAJOR = 0, AMD_CODE_VERSION_MINOR = 1 }
 
enum  amd_element_byte_size_t { AMD_ELEMENT_2_BYTES = 0, AMD_ELEMENT_4_BYTES = 1, AMD_ELEMENT_8_BYTES = 2, AMD_ELEMENT_16_BYTES = 3 }
 The values used to define the number of bytes to use for the swizzle element size. More...
 
enum  amd_code_property_mask_t {
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT = 0, AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT, AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT = 2, AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT, AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT = 3, AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT,
  AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT = 4, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT, AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT = 5,
  AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT, AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT = 6, AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH = 1,
  AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT = 7, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT = 8, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT = 9,
  AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z = ((1 << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT, AMD_CODE_PROPERTY_RESERVED1_SHIFT = 10, AMD_CODE_PROPERTY_RESERVED1_WIDTH = 6,
  AMD_CODE_PROPERTY_RESERVED1 = ((1 << AMD_CODE_PROPERTY_RESERVED1_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED1_SHIFT, AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT = 16, AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH = 1, AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS = ((1 << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH) - 1) << AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT,
  AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT = 17, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH = 2, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE = ((1 << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH) - 1) << AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT, AMD_CODE_PROPERTY_IS_PTR64_SHIFT = 19,
  AMD_CODE_PROPERTY_IS_PTR64_WIDTH = 1, AMD_CODE_PROPERTY_IS_PTR64 = ((1 << AMD_CODE_PROPERTY_IS_PTR64_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_PTR64_SHIFT, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT = 20, AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH = 1,
  AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK = ((1 << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT, AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT = 21, AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH = 1, AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT,
  AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT = 22, AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH = 1, AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED = ((1 << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH) - 1) << AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT, AMD_CODE_PROPERTY_RESERVED2_SHIFT = 23,
  AMD_CODE_PROPERTY_RESERVED2_WIDTH = 9, AMD_CODE_PROPERTY_RESERVED2 = ((1 << AMD_CODE_PROPERTY_RESERVED2_WIDTH) - 1) << AMD_CODE_PROPERTY_RESERVED2_SHIFT
}
 

Macro Definition Documentation

◆ AMD_HSA_BITS_GET

#define AMD_HSA_BITS_GET (   src,
  mask 
)    ((src & mask) >> mask ## _SHIFT) \

Definition at line 53 of file AMDKernelCodeT.h.

◆ AMD_HSA_BITS_SET

#define AMD_HSA_BITS_SET (   dst,
  mask,
  val 
)
Value:
dst &= (~(1 << mask ## _SHIFT) & ~mask); \
dst |= (((val) << mask ## _SHIFT) & mask)

Definition at line 48 of file AMDKernelCodeT.h.

Referenced by getElementByteSizeValue().

Typedef Documentation

◆ amd_code_property32_t

Every amd_*_code_t has the following properties, which are composed of a number of bit fields.

Every bit field has a mask (AMD_CODE_PROPERTY_*), bit width (AMD_CODE_PROPERTY_*_WIDTH, and bit shift amount (AMD_CODE_PROPERTY_*_SHIFT) for convenient access. Unused bits must be 0.

(Note that bit fields cannot be used as their layout is implementation defined in the C standard and so cannot be used to specify an ABI)

Definition at line 77 of file AMDKernelCodeT.h.

◆ amd_code_version32_t

The version of the amd_*_code_t struct.

Minor versions must be backward compatible.

Definition at line 41 of file AMDKernelCodeT.h.

◆ amd_compute_pgm_resource_register64_t

Shader program settings for CS.

Contains COMPUTE_PGM_RSRC1 and COMPUTE_PGM_RSRC2 registers.

Definition at line 67 of file AMDKernelCodeT.h.

◆ amd_kernel_code_t

AMD Kernel Code Object (amd_kernel_code_t).

GPU CP uses the AMD Kernel Code Object to set up the hardware to execute the kernel dispatch.

Initial Kernel Register State.

Initial kernel register state will be set up by CP/SPI prior to the start of execution of every wavefront. This is limited by the constraints of the current hardware.

The order of the SGPR registers is defined, but the Finalizer can specify which ones are actually setup in the amd_kernel_code_t object using the enable_sgpr_* bit fields. The register numbers used for enabled registers are dense starting at SGPR0: the first enabled register is SGPR0, the next enabled register is SGPR1 etc.; disabled registers do not have an SGPR number.

The initial SGPRs comprise up to 16 User SRGPs that are set up by CP and apply to all waves of the grid. It is possible to specify more than 16 User SGPRs using the enable_sgpr_* bit fields, in which case only the first 16 are actually initialized. These are then immediately followed by the System SGPRs that are set up by ADC/SPI and can have different values for each wave of the grid dispatch.

SGPR register initial state is defined as follows:

Private Segment Buffer (enable_sgpr_private_segment_buffer): Number of User SGPR registers: 4. V# that can be used, together with Scratch Wave Offset as an offset, to access the Private/Spill/Arg segments using a segment address. It must be set as follows:

  • Base address: of the scratch memory area used by the dispatch. It does not include the scratch wave offset. It will be the per process SH_HIDDEN_PRIVATE_BASE_VMID plus any offset from this dispatch (for example there may be a per pipe offset, or per AQL Queue offset).
  • Stride + data_format: Element Size * Index Stride (???)
  • Cache swizzle: ???
  • Swizzle enable: SH_STATIC_MEM_CONFIG.SWIZZLE_ENABLE (must be 1 for scratch)
  • Num records: Flat Scratch Work Item Size / Element Size (???)
  • Dst_sel_*: ???
  • Num_format: ???
  • Element_size: SH_STATIC_MEM_CONFIG.ELEMENT_SIZE (will be DWORD, must agree with amd_kernel_code_t.privateElementSize)
  • Index_stride: SH_STATIC_MEM_CONFIG.INDEX_STRIDE (will be 64 as must be number of wavefront lanes for scratch, must agree with amd_kernel_code_t.wavefrontSize)
  • Add tid enable: 1
  • ATC: from SH_MEM_CONFIG.PRIVATE_ATC,
  • Hash_enable: ???
  • Heap: ???
  • Mtype: from SH_STATIC_MEM_CONFIG.PRIVATE_MTYPE
  • Type: 0 (a buffer) (???)

Dispatch Ptr (enable_sgpr_dispatch_ptr): Number of User SGPR registers: 2. 64 bit address of AQL dispatch packet for kernel actually executing.

Queue Ptr (enable_sgpr_queue_ptr): Number of User SGPR registers: 2. 64 bit address of AmdQueue object for AQL queue on which the dispatch packet was queued.

Kernarg Segment Ptr (enable_sgpr_kernarg_segment_ptr): Number of User SGPR registers: 2. 64 bit address of Kernarg segment. This is directly copied from the kernargPtr in the dispatch packet. Having CP load it once avoids loading it at the beginning of every wavefront.

Dispatch Id (enable_sgpr_dispatch_id): Number of User SGPR registers: 2. 64 bit Dispatch ID of the dispatch packet being executed.

Flat Scratch Init (enable_sgpr_flat_scratch_init): Number of User SGPR registers: 2. This is 2 SGPRs.

For CI/VI: The first SGPR is a 32 bit byte offset from SH_MEM_HIDDEN_PRIVATE_BASE to base of memory for scratch for this dispatch. This is the same offset used in computing the Scratch Segment Buffer base address. The value of Scratch Wave Offset must be added by the kernel code and moved to SGPRn-4 for use as the FLAT SCRATCH BASE in flat memory instructions.

The second SGPR is 32 bit byte size of a single work-item's scratch memory usage. This is directly loaded from the dispatch packet Private Segment Byte Size and rounded up to a multiple of DWORD.

Todo:
[Does CP need to round this to >4 byte alignment?]
The kernel code must move to SGPRn-3 for use as the FLAT SCRATCH SIZE in
flat memory instructions. Having CP load it once avoids loading it at
the beginning of every wavefront.

For PI: This is the 64 bit base address of the scratch backing memory for allocated by CP for this dispatch.

Private Segment Size (enable_sgpr_private_segment_size): Number of User SGPR registers: 1. The 32 bit byte size of a single work-item's scratch memory allocation. This is the value from the dispatch packet. Private Segment Byte Size rounded up by CP to a multiple of DWORD.

Todo:
[Does CP need to round this to >4 byte alignment?]

Having CP load it once avoids loading it at the beginning of every wavefront.

Todo:
[This will not be used for CI/VI since it is the same value as the second SGPR of Flat Scratch Init. However, it is need for PI which changes meaning of Flat Scratchg Init..]

Grid Work-Group Count X (enable_sgpr_grid_workgroup_count_x): Number of User SGPR registers: 1. 32 bit count of the number of work-groups in the X dimension for the grid being executed. Computed from the fields in the HsaDispatchPacket as ((gridSize.x+workgroupSize.x-1)/workgroupSize.x).

Grid Work-Group Count Y (enable_sgpr_grid_workgroup_count_y): Number of User SGPR registers: 1. 32 bit count of the number of work-groups in the Y dimension for the grid being executed. Computed from the fields in the HsaDispatchPacket as ((gridSize.y+workgroupSize.y-1)/workgroupSize.y).

Only initialized if <16 previous SGPRs initialized.

Grid Work-Group Count Z (enable_sgpr_grid_workgroup_count_z): Number of User SGPR registers: 1. 32 bit count of the number of work-groups in the Z dimension for the grid being executed. Computed from the fields in the HsaDispatchPacket as ((gridSize.z+workgroupSize.z-1)/workgroupSize.z).

Only initialized if <16 previous SGPRs initialized.

Work-Group Id X (enable_sgpr_workgroup_id_x): Number of System SGPR registers: 1. 32 bit work group id in X dimension of grid for wavefront. Always present.

Work-Group Id Y (enable_sgpr_workgroup_id_y): Number of System SGPR registers: 1. 32 bit work group id in Y dimension of grid for wavefront.

Work-Group Id Z (enable_sgpr_workgroup_id_z): Number of System SGPR registers: 1. 32 bit work group id in Z dimension of grid for wavefront. If present then Work-group Id Y will also be present

Work-Group Info (enable_sgpr_workgroup_info): Number of System SGPR registers: 1. {first_wave, 14'b0000, ordered_append_term[10:0], threadgroup_size_in_waves[5:0]}

Private Segment Wave Byte Offset (enable_sgpr_private_segment_wave_byte_offset): Number of System SGPR registers: 1. 32 bit byte offset from base of dispatch scratch base. Must be used as an offset with Private/Spill/Arg segment address when using Scratch Segment Buffer. It must be added to Flat Scratch Offset if setting up FLAT SCRATCH for flat addressing.

The order of the VGPR registers is defined, but the Finalizer can specify which ones are actually setup in the amd_kernel_code_t object using the enableVgpr* bit fields. The register numbers used for enabled registers are dense starting at VGPR0: the first enabled register is VGPR0, the next enabled register is VGPR1 etc.; disabled registers do not have an VGPR number.

VGPR register initial state is defined as follows:

Work-Item Id X (always initialized): Number of registers: 1. 32 bit work item id in X dimension of work-group for wavefront lane.

Work-Item Id X (enable_vgpr_workitem_id > 0): Number of registers: 1. 32 bit work item id in Y dimension of work-group for wavefront lane.

Work-Item Id X (enable_vgpr_workitem_id > 0): Number of registers: 1. 32 bit work item id in Z dimension of work-group for wavefront lane.

The setting of registers is being done by existing GPU hardware as follows: 1) SGPRs before the Work-Group Ids are set by CP using the 16 User Data registers. 2) Work-group Id registers X, Y, Z are set by SPI which supports any combination including none. 3) Scratch Wave Offset is also set by SPI which is why its value cannot be added into the value Flat Scratch Offset which would avoid the Finalizer generated prolog having to do the add. 4) The VGPRs are set by SPI which only supports specifying either (X), (X, Y) or (X, Y, Z).

Flat Scratch Dispatch Offset and Flat Scratch Size are adjacent SGRRs so they can be moved as a 64 bit value to the hardware required SGPRn-3 and SGPRn-4 respectively using the Finalizer ?FLAT_SCRATCH? Register.

The global segment can be accessed either using flat operations or buffer operations. If buffer operations are used then the Global Buffer used to access HSAIL Global/Readonly/Kernarg (which are combine) segments using a segment address is not passed into the kernel code by CP since its base address is always 0. Instead the Finalizer generates prolog code to initialize 4 SGPRs with a V# that has the following properties, and then uses that in the buffer instructions:

  • base address of 0
  • no swizzle
  • ATC=1
  • MTYPE set to support memory coherence specified in amd_kernel_code_t.globalMemoryCoherence

When the Global Buffer is used to access the Kernarg segment, must add the dispatch packet kernArgPtr to a kernarg segment address before using this V#. Alternatively scalar loads can be used if the kernarg offset is uniform, as the kernarg segment is constant for the duration of the kernel execution.

◆ hsa_dim3_t

typedef struct hsa_dim3_s hsa_dim3_t

◆ hsa_ext_brig_machine_model8_t

Definition at line 28 of file AMDKernelCodeT.h.

◆ hsa_ext_brig_profile8_t

typedef uint8_t hsa_ext_brig_profile8_t

Definition at line 27 of file AMDKernelCodeT.h.

◆ hsa_ext_code_kind32_t

Definition at line 31 of file AMDKernelCodeT.h.

◆ hsa_ext_code_kind_t

Definition at line 26 of file AMDKernelCodeT.h.

◆ hsa_ext_control_directive_present64_t

Definition at line 29 of file AMDKernelCodeT.h.

◆ hsa_ext_control_directives_t

The hsa_ext_control_directives_t specifies the values for the HSAIL control directives.

These control how the finalizer generates code. This struct is used both as an argument to hsaFinalizeKernel to specify values for the control directives, and is used in HsaKernelCode to record the values of the control directives that the finalize used when generating the code which either came from the finalizer argument or explicit HSAIL control directives. See the definition of the control directives in HSA Programmer's Reference Manual which also defines how the values specified as finalizer arguments have to agree with the control directives in the HSAIL code.

◆ hsa_ext_exception_kind16_t

typedef uint16_t hsa_ext_exception_kind16_t

Definition at line 30 of file AMDKernelCodeT.h.

◆ hsa_powertwo8_t

typedef uint8_t hsa_powertwo8_t

Definition at line 25 of file AMDKernelCodeT.h.

Enumeration Type Documentation

◆ amd_code_property_mask_t

Enumerator
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_SHIFT 

Enable the setup of the SGPR user data registers (AMD_CODE_PROPERTY_ENABLE_SGPR_*), see documentation of amd_kernel_code_t for initial register state.

The total number of SGPRuser data registers requested must not exceed 16. Any requests beyond 16 will be ignored.

Used to set COMPUTE_PGM_RSRC2.USER_SGPR (set to total count of SGPR user data registers enabled up to 16).

AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER 
AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR 
AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR 
AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR 
AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID 
AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT 
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_SHIFT 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z_WIDTH 
AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z 
AMD_CODE_PROPERTY_RESERVED1_SHIFT 
AMD_CODE_PROPERTY_RESERVED1_WIDTH 
AMD_CODE_PROPERTY_RESERVED1 
AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_SHIFT 

Control wave ID base counter for GDS ordered-append.

Used to set COMPUTE_DISPATCH_INITIATOR.ORDERED_APPEND_ENBL. (Not sure if ORDERED_APPEND_MODE also needs to be settable)

AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS_WIDTH 
AMD_CODE_PROPERTY_ENABLE_ORDERED_APPEND_GDS 
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_SHIFT 

The interleave (swizzle) element size in bytes required by the code for private memory.

This must be 2, 4, 8 or 16. This value is provided to the finalizer when it is invoked and is recorded here. The hardware will interleave the memory requests of each lane of a wavefront by this element size to ensure each work-item gets a distinct memory memory location. Therefore, the finalizer ensures that all load and store operations done to private memory do not exceed this size. For example, if the element size is 4 (32-bits or dword) and a 64-bit value must be loaded, the finalizer will generate two 32-bit loads. This ensures that the interleaving will get the work-item specific dword for both halves of the 64-bit value. If it just did a 64-bit load then it would get one dword which belonged to its own work-item, but the second dword would belong to the adjacent lane work-item since the interleaving is in dwords.

The value used must match the value that the runtime configures the GPU flat scratch (SH_STATIC_MEM_CONFIG.ELEMENT_SIZE). This is generally DWORD.

uSE VALUES FROM THE AMD_ELEMENT_BYTE_SIZE_T ENUM.

AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE_WIDTH 
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE 
AMD_CODE_PROPERTY_IS_PTR64_SHIFT 

Are global memory addresses 64 bits.

Must match amd_kernel_code_t.hsail_machine_model == HSA_MACHINE_LARGE. Must also match SH_MEM_CONFIG.PTR32 (GFX6 (SI)/GFX7 (CI)), SH_MEM_CONFIG.ADDRESS_MODE (GFX8 (VI)+).

AMD_CODE_PROPERTY_IS_PTR64_WIDTH 
AMD_CODE_PROPERTY_IS_PTR64 
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_SHIFT 

Indicate if the generated ISA is using a dynamically sized call stack.

This can happen if calls are implemented using a call stack and recursion, alloca or calls to indirect functions are present. In these cases the Finalizer cannot compute the total private segment size at compile time. In this case the workitem_private_segment_byte_size only specifies the statically know private segment size, and additional space must be added for the call stack.

AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK_WIDTH 
AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK 
AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_SHIFT 

Indicate if code generated has support for debugging.

AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED_WIDTH 
AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED 
AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_SHIFT 
AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED_WIDTH 
AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED 
AMD_CODE_PROPERTY_RESERVED2_SHIFT 
AMD_CODE_PROPERTY_RESERVED2_WIDTH 
AMD_CODE_PROPERTY_RESERVED2 

Definition at line 78 of file AMDKernelCodeT.h.

◆ amd_code_version_t

Enumerator
AMD_CODE_VERSION_MAJOR 
AMD_CODE_VERSION_MINOR 

Definition at line 42 of file AMDKernelCodeT.h.

◆ amd_element_byte_size_t

The values used to define the number of bytes to use for the swizzle element size.

Enumerator
AMD_ELEMENT_2_BYTES 
AMD_ELEMENT_4_BYTES 
AMD_ELEMENT_8_BYTES 
AMD_ELEMENT_16_BYTES 

Definition at line 58 of file AMDKernelCodeT.h.