openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

202 lines
7.4 KiB

import numpy as np
import unittest
import subprocess, struct, math
from typing import cast
from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import diskcache, OSX, getenv
@diskcache
def assemble(code:str) -> bytes:
try:
LLVM_MC = "llvm-mc" if OSX else "/opt/rocm/llvm/bin/llvm-mc"
return subprocess.run([LLVM_MC, "--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa", "-filetype=obj", "-o", "-"],
input=code.encode("utf-8"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True).stdout
except subprocess.CalledProcessError as e:
print("stderr:")
print(e.stderr.decode())
raise
# copied from extra/rdna
def get_prg(code:str, v_cnt:int, s_cnt:int):
function_name = "test"
metadata = f"""
amdhsa.kernels:
- .args:
- .address_space: global
.name: buf_0
.offset: 0
.size: 8
.type_name: unsigned int*
.value_kind: global_buffer
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 8
.language: OpenCL C
.language_version:
- 1
- 2
.max_flat_workgroup_size: 256
.name: test
.private_segment_fixed_size: 0
.sgpr_count: {s_cnt}
.sgpr_spill_count: 0
.symbol: test.kd
.uses_dynamic_stack: false
.vgpr_count: {v_cnt}
.vgpr_spill_count: 0
.wavefront_size: 32
amdhsa.target: amdgcn-amd-amdhsa--gfx1100
amdhsa.version:
- 1
- 2
"""
boilerplate_start = f"""
.rodata
.global {function_name}.kd
.type {function_name}.kd,STT_OBJECT
.align 0x10
.amdhsa_kernel {function_name}"""
kernel_desc = {
'.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
'.amdhsa_next_free_vgpr': v_cnt, # this matters!
'.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
'.amdhsa_next_free_sgpr': s_cnt,
'.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3,
'.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0,
'.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
'.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
'.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
'.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0,
'.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
'.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0,
'.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
'.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
code_start = f""".end_amdhsa_kernel
.text
.global {function_name}
.type {function_name},@function
.p2align 8
{function_name}:
"""
ret = ".amdgpu_metadata\n" + metadata + ".end_amdgpu_metadata" + boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) \
+ "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}"
return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret))
def get_output(s:str, n_threads:int=1):
assert n_threads <= 32
code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s,
"s_waitcnt 0",
"global_store_b32 v0, v1, s[0:1]",
"s_nop 0", "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)", "s_endpgm"])
test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().lazydata.buffer
prg = get_prg(code, 32, 32)
prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True)
return test.numpy()
def f16_to_bits(x:float) -> int: return struct.unpack('<H', struct.pack('<e', x))[0]
def f32_from_bits(x:int) -> float: return struct.unpack('<f', struct.pack('<I', x))[0]
def f32_to_bits(x:float) -> int: return struct.unpack('<I', struct.pack('<f', x))[0]
@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")
class TestHW(unittest.TestCase):
def setUp(self):
if getenv("MOCKGPU"): subprocess.run(["cargo", "build", "--release", "--manifest-path", "./extra/remu/Cargo.toml"], check=True)
def test_simple(self):
out = get_output("""
v_mov_b32_e32 v10 42
v_mov_b32_e32 v1 v10
""", n_threads=2)
np.testing.assert_equal(out, 42)
def test_exec_mov(self):
out = get_output("""
v_mov_b32_e32 v10 42
s_mov_b32_e32 exec_lo 0b10
v_mov_b32_e32 v10 10
s_mov_b32_e32 exec_lo 0b11
v_mov_b32_e32 v1 v10
""", n_threads=2)
np.testing.assert_equal(out, [42, 10])
def test_exec_cmp_vopc(self):
out = get_output("""
s_mov_b32 vcc_lo 0 // reset vcc
v_mov_b32_e32 v10 42
v_mov_b32_e32 v11 10
s_mov_b32_e32 exec_lo 0b01
v_cmp_ne_u32 v10 v11
s_mov_b32_e32 exec_lo 0b11
v_mov_b32_e32 v1 vcc_lo
""", n_threads=2)
np.testing.assert_equal(out, 0b01)
def test_exec_cmpx_vop3(self):
out = get_output("""
v_mov_b32_e32 v10 42
v_mov_b32_e32 v11 10
s_mov_b32_e32 exec_lo 0b01
v_cmpx_ne_u32 v10 v11
s_mov_b32_e32 s10 exec_lo
s_mov_b32_e32 exec_lo 0b11
v_mov_b32_e32 v1 s10
""", n_threads=2)
np.testing.assert_equal(out, 0b01)
def test_fmac_vop3_modifier(self):
init_state = f"""
v_mov_b32_e32 v10 {f16_to_bits(4.0)}
v_mov_b32_e32 v11 {f16_to_bits(3.0)}
v_mov_b32_e32 v1 {f16_to_bits(2.0)}
"""
self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 v11 v10"), f16_to_bits(14.))
self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 -v11 v10"), f16_to_bits(-10.))
self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 -v11 -v10"), f16_to_bits(14.))
def test_s_abs_i32(self):
def s_abs_i32(x, y, dst="s10", scc=0):
for reg,val in [(dst, y), ("scc", scc)]:
self.assertEqual(get_output(f"""
s_mov_b32_e32 {dst} {x}
s_abs_i32 {dst} {dst}
v_mov_b32_e32 v1 {reg}
""")[0], val)
s_abs_i32(0x00000001, 0x00000001, scc=1)
s_abs_i32(0x7fffffff, 0x7fffffff, scc=1)
s_abs_i32(0x80000000, 0x80000000, scc=1)
s_abs_i32(0x80000001, 0x7fffffff, scc=1)
s_abs_i32(0x80000002, 0x7ffffffe, scc=1)
s_abs_i32(0xffffffff, 0x00000001, scc=1)
s_abs_i32(0, 0, scc=0)
def test_v_rcp_f32_neg_vop3(self):
def v_neg_rcp_f32(x:float, y:float):
out = get_output(f"""
v_mov_b32_e32 v1 {f32_to_bits(x)}
v_rcp_f32_e64 v1, -v1
""")[0]
assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
v_neg_rcp_f32(math.inf, -0.0)
v_neg_rcp_f32(-math.inf, 0.0)
v_neg_rcp_f32(0.0, -math.inf)
v_neg_rcp_f32(-0.0, math.inf)
v_neg_rcp_f32(-2.0, 0.5)
v_neg_rcp_f32(2.0, -0.5)
def test_v_cndmask_b32_neg(self):
def v_neg(x:int|float, y:float):
out = get_output(f"""
v_mov_b32_e32 v1 {f32_to_bits(x)}
s_mov_b32_e32 s10 1 // always pick -v1
v_cndmask_b32 v1, v1, -v1 s10
""")[0]
assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
v_neg(-0.0, 0.0)
v_neg(0.0, -0.0)
v_neg(2.0, -2.0)
v_neg(math.inf, -math.inf)
v_neg(-math.inf, math.inf)
if __name__ == "__main__":
unittest.main()