You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
81 lines
2.4 KiB
81 lines
2.4 KiB
2 months ago
|
.global _start
|
||
|
_start:
|
||
|
.rodata
|
||
|
.align 0x10
|
||
|
.global code.kd
|
||
|
.type code.kd,STT_OBJECT
|
||
|
# amd_kernel_code_t (must be at 0x440 for kernel_code_entry_byte_offset to be right)
|
||
|
code.kd:
|
||
|
# amd_kernel_..., amd_machine_...
|
||
|
.long 0,0,0,0
|
||
|
# kernel_code_entry_byte_offset, kernel_code_prefetch_byte_offset
|
||
|
.long 0x00000bc0,0x00000000,0x00000000,0x00000000
|
||
|
# kernel_code_prefetch_byte_size, max_scratch_backing_memory_byte_size
|
||
|
.long 0,0,0,0
|
||
|
# compute_pgm_rsrc1, compute_pgm_rsrc2, kernel_code_properties, workitem_private_segment_byte_size
|
||
|
.long 0x60af0000,0x0000009e,0x00000408,0x00000000
|
||
|
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_32 | AMD_COMPUTE_PGM_RSRC_ONE_FLOAT_DENORM_MODE_16_64
|
||
|
# compute_pgm_rsrc1 |= AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_DX10_CLAMP | AMD_COMPUTE_PGM_RSRC_ONE_ENABLE_IEEE_MODE
|
||
|
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_USER_SGPR_COUNT = 0xF
|
||
|
# compute_pgm_rsrc2 |= AMD_COMPUTE_PGM_RSRC_TWO_ENABLE_SGPR_WORKGROUP_ID_X
|
||
|
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_KERNARG_SEGMENT_PTR = 1
|
||
|
# kernel_code_properties |= AMD_KERNEL_CODE_PROPERTIES_RESERVED1 = 1
|
||
|
.text
|
||
|
.global code
|
||
|
.type code,STT_FUNC
|
||
|
code:
|
||
|
# https://llvm.org/docs/AMDGPUUsage.html#initial-kernel-execution-state
|
||
|
# s[0:1] contains the kernarg_address
|
||
|
# TODO: can we use s[2:3] if this was really a wave since we only alloced 2 SGPRs?
|
||
|
s_load_b64 s[2:3], s[0:1], null
|
||
|
|
||
|
s_mov_b32 s8, 0
|
||
|
loop:
|
||
|
s_addk_i32 s8, 1
|
||
|
s_cmp_eq_u32 s8, 100000
|
||
|
// FLOPS
|
||
|
s_cbranch_scc0 loop
|
||
|
|
||
|
# wait for the s_load_b64
|
||
|
s_waitcnt lgkmcnt(0)
|
||
|
|
||
|
v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 2.0
|
||
|
global_store_b32 v0, v1, s[2:3]
|
||
|
|
||
|
# Deallocate all VGPRs for this wave. Use only when next instruction is S_ENDPGM.
|
||
|
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
||
|
s_endpgm
|
||
|
s_code_end
|
||
|
|
||
|
.amdgpu_metadata
|
||
|
amdhsa.kernels:
|
||
|
- .args:
|
||
|
- .address_space: global
|
||
|
.name: a
|
||
|
.offset: 0
|
||
|
.size: 8
|
||
|
.type_name: 'float*'
|
||
|
.value_kind: global_buffer
|
||
|
.group_segment_fixed_size: 0
|
||
|
.kernarg_segment_align: 8
|
||
|
.kernarg_segment_size: 8
|
||
|
.language: OpenCL C
|
||
|
.language_version:
|
||
|
- 1
|
||
|
- 2
|
||
|
.max_flat_workgroup_size: 256
|
||
|
.name: code
|
||
|
.private_segment_fixed_size: 0
|
||
|
.sgpr_count: 2
|
||
|
.sgpr_spill_count: 0
|
||
|
.symbol: code.kd
|
||
|
.uses_dynamic_stack: false
|
||
|
.vgpr_count: 256
|
||
|
.vgpr_spill_count: 0
|
||
|
.wavefront_size: 32
|
||
|
amdhsa.target: amdgcn-amd-amdhsa--gfx1100
|
||
|
amdhsa.version:
|
||
|
- 1
|
||
|
- 2
|
||
|
.end_amdgpu_metadata
|