import numpy as np import unittest import subprocess, struct, math from typing import cast from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice from tinygrad import Tensor, dtypes, Device from tinygrad.helpers import diskcache, OSX, getenv @diskcache def assemble(code:str) -> bytes: try: LLVM_MC = "llvm-mc" if OSX else "/opt/rocm/llvm/bin/llvm-mc" return subprocess.run([LLVM_MC, "--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa", "-filetype=obj", "-o", "-"], input=code.encode("utf-8"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True).stdout except subprocess.CalledProcessError as e: print("stderr:") print(e.stderr.decode()) raise # copied from extra/rdna def get_prg(code:str, v_cnt:int, s_cnt:int): function_name = "test" metadata = f""" amdhsa.kernels: - .args: - .address_space: global .name: buf_0 .offset: 0 .size: 8 .type_name: unsigned int* .value_kind: global_buffer .group_segment_fixed_size: 0 .kernarg_segment_align: 8 .kernarg_segment_size: 8 .language: OpenCL C .language_version: - 1 - 2 .max_flat_workgroup_size: 256 .name: test .private_segment_fixed_size: 0 .sgpr_count: {s_cnt} .sgpr_spill_count: 0 .symbol: test.kd .uses_dynamic_stack: false .vgpr_count: {v_cnt} .vgpr_spill_count: 0 .wavefront_size: 32 amdhsa.target: amdgcn-amd-amdhsa--gfx1100 amdhsa.version: - 1 - 2 """ boilerplate_start = f""" .rodata .global {function_name}.kd .type {function_name}.kd,STT_OBJECT .align 0x10 .amdhsa_kernel {function_name}""" kernel_desc = { '.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0, '.amdhsa_next_free_vgpr': v_cnt, # this matters! '.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0, '.amdhsa_next_free_sgpr': s_cnt, '.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3, '.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0, '.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0, '.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1, '.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real? '.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0, '.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0, '.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0, '.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1, '.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0} code_start = f""".end_amdhsa_kernel .text .global {function_name} .type {function_name},@function .p2align 8 {function_name}: """ ret = ".amdgpu_metadata\n" + metadata + ".end_amdgpu_metadata" + boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) \ + "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}" return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret)) def get_output(s:str, n_threads:int=1): assert n_threads <= 32 code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s, "s_waitcnt 0", "global_store_b32 v0, v1, s[0:1]", "s_nop 0", "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)", "s_endpgm"]) test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().lazydata.buffer prg = get_prg(code, 32, 32) prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True) return test.numpy() def f16_to_bits(x:float) -> int: return struct.unpack(' float: return struct.unpack(' int: return struct.unpack('