openpilot_comma/tinygrad_repo/extra/remu/test/hwtest.py

import numpy as np
import unittest
import subprocess, struct, math
from typing import cast
from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import diskcache, OSX, getenv

@diskcache
def assemble(code:str) -> bytes:
  try:
    LLVM_MC = "llvm-mc" if OSX else "/opt/rocm/llvm/bin/llvm-mc"
    return subprocess.run([LLVM_MC, "--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa", "-filetype=obj", "-o", "-"],
                           input=code.encode("utf-8"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True).stdout
  except subprocess.CalledProcessError as e:
    print("stderr:")
    print(e.stderr.decode())
    raise

# copied from extra/rdna
def get_prg(code:str, v_cnt:int, s_cnt:int):
  function_name = "test"
  metadata = f"""
amdhsa.kernels:
- .args:
  - .address_space: global
    .name: buf_0
    .offset: 0
    .size: 8
    .type_name: unsigned int*
    .value_kind: global_buffer
  .group_segment_fixed_size: 0
  .kernarg_segment_align: 8
  .kernarg_segment_size: 8
  .language: OpenCL C
  .language_version:
  - 1
  - 2
  .max_flat_workgroup_size: 256
  .name: test
  .private_segment_fixed_size: 0
  .sgpr_count: {s_cnt}
  .sgpr_spill_count: 0
  .symbol: test.kd
  .uses_dynamic_stack: false
  .vgpr_count: {v_cnt}
  .vgpr_spill_count: 0
  .wavefront_size: 32
amdhsa.target: amdgcn-amd-amdhsa--gfx1100
amdhsa.version:
- 1
- 2
  """
  boilerplate_start = f"""
  .rodata
  .global {function_name}.kd
  .type {function_name}.kd,STT_OBJECT
  .align 0x10
  .amdhsa_kernel {function_name}"""
  kernel_desc = {
    '.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,
    '.amdhsa_next_free_vgpr': v_cnt,   # this matters!
    '.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,
    '.amdhsa_next_free_sgpr': s_cnt,
    '.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3,
    '.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0,
    '.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,
    '.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,
    '.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?
    '.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0,
    '.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,
    '.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0,
    '.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,
    '.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}
  code_start = f""".end_amdhsa_kernel
  .text
  .global {function_name}
  .type {function_name},@function
  .p2align 8
  {function_name}:
  """
  ret = ".amdgpu_metadata\n" + metadata + ".end_amdgpu_metadata" + boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) \
      + "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}"
  return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret))

def get_output(s:str, n_threads:int=1):
  assert n_threads <= 32
  code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s,
    "s_waitcnt 0",
    "global_store_b32 v0, v1, s[0:1]",
    "s_nop 0", "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)", "s_endpgm"])
  test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().uop.buffer
  prg = get_prg(code, 32, 32)
  prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True)
  return test.numpy()

def f16_to_bits(x:float) -> int: return struct.unpack('<H', struct.pack('<e', x))[0]
def f32_from_bits(x:int) -> float: return struct.unpack('<f', struct.pack('<I', x))[0]
def f32_to_bits(x:float) -> int: return struct.unpack('<I', struct.pack('<f', x))[0]

@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")
class TestHW(unittest.TestCase):
  def setUp(self):
    if getenv("MOCKGPU"): subprocess.run(["cargo", "build", "--release", "--manifest-path", "./extra/remu/Cargo.toml"], check=True)

  def test_simple(self):
    out = get_output("""
    v_mov_b32_e32 v10 42
    v_mov_b32_e32 v1 v10
    """, n_threads=2)
    np.testing.assert_equal(out, 42)

  def test_exec_mov(self):
    out = get_output("""
    v_mov_b32_e32 v10 42
    s_mov_b32_e32 exec_lo 0b10
    v_mov_b32_e32 v10 10
    s_mov_b32_e32 exec_lo 0b11
    v_mov_b32_e32 v1 v10
    """, n_threads=2)
    np.testing.assert_equal(out, [42, 10])

  def test_exec_cmp_vopc(self):
    out = get_output("""
    s_mov_b32 vcc_lo 0 // reset vcc
    v_mov_b32_e32 v10 42
    v_mov_b32_e32 v11 10
    s_mov_b32_e32 exec_lo 0b01
    v_cmp_ne_u32 v10 v11
    s_mov_b32_e32 exec_lo 0b11
    v_mov_b32_e32 v1 vcc_lo
    """, n_threads=2)
    np.testing.assert_equal(out, 0b01)

  def test_exec_cmpx_vop3(self):
    out = get_output("""
    v_mov_b32_e32 v10 42
    v_mov_b32_e32 v11 10
    s_mov_b32_e32 exec_lo 0b01
    v_cmpx_ne_u32 v10 v11
    s_mov_b32_e32 s10 exec_lo
    s_mov_b32_e32 exec_lo 0b11
    v_mov_b32_e32 v1 s10
    """, n_threads=2)
    np.testing.assert_equal(out, 0b01)

  def test_fmac_vop3_modifier(self):
    init_state = f"""
    v_mov_b32_e32 v10 {f16_to_bits(4.0)}
    v_mov_b32_e32 v11 {f16_to_bits(3.0)}
    v_mov_b32_e32 v1 {f16_to_bits(2.0)}
    """
    self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 v11 v10"), f16_to_bits(14.))
    self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 -v11 v10"), f16_to_bits(-10.))
    self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 -v11 -v10"), f16_to_bits(14.))

  def test_s_abs_i32(self):
    def s_abs_i32(x, y, dst="s10", scc=0):
      for reg,val in [(dst, y), ("scc", scc)]:
        self.assertEqual(get_output(f"""
        s_mov_b32_e32 {dst} {x}
        s_abs_i32 {dst} {dst}
        v_mov_b32_e32 v1 {reg}
        """)[0], val)
    s_abs_i32(0x00000001, 0x00000001, scc=1)
    s_abs_i32(0x7fffffff, 0x7fffffff, scc=1)
    s_abs_i32(0x80000000, 0x80000000, scc=1)
    s_abs_i32(0x80000001, 0x7fffffff, scc=1)
    s_abs_i32(0x80000002, 0x7ffffffe, scc=1)
    s_abs_i32(0xffffffff, 0x00000001, scc=1)
    s_abs_i32(0, 0, scc=0)

  def test_v_rcp_f32_neg_vop3(self):
    def v_neg_rcp_f32(x:float, y:float):
      out = get_output(f"""
      v_mov_b32_e32 v1 {f32_to_bits(x)}
      v_rcp_f32_e64 v1, -v1
      """)[0]
      assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
    v_neg_rcp_f32(math.inf, -0.0)
    v_neg_rcp_f32(-math.inf, 0.0)
    v_neg_rcp_f32(0.0, -math.inf)
    v_neg_rcp_f32(-0.0, math.inf)
    v_neg_rcp_f32(-2.0, 0.5)
    v_neg_rcp_f32(2.0, -0.5)

  def test_v_cndmask_b32_neg(self):
    def v_neg(x:int|float, y:float):
      out = get_output(f"""
      v_mov_b32_e32 v1 {f32_to_bits(x)}
      s_mov_b32_e32 s10 1 // always pick -v1
      v_cndmask_b32 v1, v1, -v1 s10
      """)[0]
      assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
    v_neg(-0.0, 0.0)
    v_neg(0.0, -0.0)
    v_neg(2.0, -2.0)
    v_neg(math.inf, -math.inf)
    v_neg(-math.inf, math.inf)

if __name__ == "__main__":
  unittest.main()
openpilot v0.9.9 release (#35334) * openpilot v0.9.9 release date: 2025-06-05T19:54:08 master commit: 8aadf02b2fd91f4e1285e18c2c7feb32d93b66f5 * AGNOS 12.4 (#35558) agnos12.4 --------- Co-authored-by: Vehicle Researcher <user@comma.ai> Co-authored-by: Maxime Desroches <desroches.maxime@gmail.com> 5 days ago			`import numpy as np`
			`import unittest`
			`import subprocess, struct, math`
			`from typing import cast`
			`from tinygrad.runtime.ops_amd import AMDProgram, AMDDevice`
			`from tinygrad import Tensor, dtypes, Device`
			`from tinygrad.helpers import diskcache, OSX, getenv`

			`@diskcache`
			`def assemble(code:str) -> bytes:`
			`try:`
			`LLVM_MC = "llvm-mc" if OSX else "/opt/rocm/llvm/bin/llvm-mc"`
			`return subprocess.run([LLVM_MC, "--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa", "-filetype=obj", "-o", "-"],`
			`input=code.encode("utf-8"), stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True).stdout`
			`except subprocess.CalledProcessError as e:`
			`print("stderr:")`
			`print(e.stderr.decode())`
			`raise`

			`# copied from extra/rdna`
			`def get_prg(code:str, v_cnt:int, s_cnt:int):`
			`function_name = "test"`
			`metadata = f"""`
			`amdhsa.kernels:`
			`- .args:`
			`- .address_space: global`
			`.name: buf_0`
			`.offset: 0`
			`.size: 8`
			`.type_name: unsigned int*`
			`.value_kind: global_buffer`
			`.group_segment_fixed_size: 0`
			`.kernarg_segment_align: 8`
			`.kernarg_segment_size: 8`
			`.language: OpenCL C`
			`.language_version:`
			`- 1`
			`- 2`
			`.max_flat_workgroup_size: 256`
			`.name: test`
			`.private_segment_fixed_size: 0`
			`.sgpr_count: {s_cnt}`
			`.sgpr_spill_count: 0`
			`.symbol: test.kd`
			`.uses_dynamic_stack: false`
			`.vgpr_count: {v_cnt}`
			`.vgpr_spill_count: 0`
			`.wavefront_size: 32`
			`amdhsa.target: amdgcn-amd-amdhsa--gfx1100`
			`amdhsa.version:`
			`- 1`
			`- 2`
			`"""`
			`boilerplate_start = f"""`
			`.rodata`
			`.global {function_name}.kd`
			`.type {function_name}.kd,STT_OBJECT`
			`.align 0x10`
			`.amdhsa_kernel {function_name}"""`
			`kernel_desc = {`
			`'.amdhsa_group_segment_fixed_size': 0, '.amdhsa_private_segment_fixed_size': 0, '.amdhsa_kernarg_size': 0,`
			`'.amdhsa_next_free_vgpr': v_cnt, # this matters!`
			`'.amdhsa_reserve_vcc': 0, '.amdhsa_reserve_xnack_mask': 0,`
			`'.amdhsa_next_free_sgpr': s_cnt,`
			`'.amdhsa_float_round_mode_32': 0, '.amdhsa_float_round_mode_16_64': 0, '.amdhsa_float_denorm_mode_32': 3, '.amdhsa_float_denorm_mode_16_64': 3,`
			`'.amdhsa_dx10_clamp': 1, '.amdhsa_ieee_mode': 1, '.amdhsa_fp16_overflow': 0,`
			`'.amdhsa_workgroup_processor_mode': 1, '.amdhsa_memory_ordered': 1, '.amdhsa_forward_progress': 0, '.amdhsa_enable_private_segment': 0,`
			`'.amdhsa_system_sgpr_workgroup_id_x': 1, '.amdhsa_system_sgpr_workgroup_id_y': 1, '.amdhsa_system_sgpr_workgroup_id_z': 1,`
			`'.amdhsa_system_sgpr_workgroup_info': 0, '.amdhsa_system_vgpr_workitem_id': 2, # is amdhsa_system_vgpr_workitem_id real?`
			`'.amdhsa_exception_fp_ieee_invalid_op': 0, '.amdhsa_exception_fp_denorm_src': 0,`
			`'.amdhsa_exception_fp_ieee_div_zero': 0, '.amdhsa_exception_fp_ieee_overflow': 0, '.amdhsa_exception_fp_ieee_underflow': 0,`
			`'.amdhsa_exception_fp_ieee_inexact': 0, '.amdhsa_exception_int_div_zero': 0,`
			`'.amdhsa_user_sgpr_dispatch_ptr': 0, '.amdhsa_user_sgpr_queue_ptr': 0, '.amdhsa_user_sgpr_kernarg_segment_ptr': 1,`
			`'.amdhsa_user_sgpr_dispatch_id': 0, '.amdhsa_user_sgpr_private_segment_size': 0, '.amdhsa_wavefront_size32': 1, '.amdhsa_uses_dynamic_stack': 0}`
			`code_start = f""".end_amdhsa_kernel`
			`.text`
			`.global {function_name}`
			`.type {function_name},@function`
			`.p2align 8`
			`{function_name}:`
			`"""`
			`ret = ".amdgpu_metadata\n" + metadata + ".end_amdgpu_metadata" + boilerplate_start + "\n" + '\n'.join("%s %d" % x for x in kernel_desc.items()) \`
			`+ "\n" + code_start + code + f"\n.size {function_name}, .-{function_name}"`
			`return AMDProgram(cast(AMDDevice, Device["AMD"]), function_name, assemble(ret))`

			`def get_output(s:str, n_threads:int=1):`
			`assert n_threads <= 32`
			`code = "\n".join(["s_load_b64 s[0:1], s[0:1], null", "v_lshlrev_b32_e32 v0, 2, v0", s,`
			`"s_waitcnt 0",`
			`"global_store_b32 v0, v1, s[0:1]",`
			`"s_nop 0", "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)", "s_endpgm"])`
openpilot v0.9.10 release date: 2025-06-22T09:02:45 master commit: 5e3fc13751dc9b9c5d5e0991a17c672eda8bd122 16 hours ago			`test = Tensor.zeros((n_threads,), dtype=dtypes.uint32).contiguous().realize().uop.buffer`
openpilot v0.9.9 release (#35334) * openpilot v0.9.9 release date: 2025-06-05T19:54:08 master commit: 8aadf02b2fd91f4e1285e18c2c7feb32d93b66f5 * AGNOS 12.4 (#35558) agnos12.4 --------- Co-authored-by: Vehicle Researcher <user@comma.ai> Co-authored-by: Maxime Desroches <desroches.maxime@gmail.com> 5 days ago			`prg = get_prg(code, 32, 32)`
			`prg(test._buf, global_size=(1, 1, 1), local_size=(n_threads, 1, 1), wait=True)`
			`return test.numpy()`

			`def f16_to_bits(x:float) -> int: return struct.unpack('<H', struct.pack('<e', x))[0]`
			`def f32_from_bits(x:int) -> float: return struct.unpack('<f', struct.pack('<I', x))[0]`
			`def f32_to_bits(x:float) -> int: return struct.unpack('<I', struct.pack('<f', x))[0]`

			`@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")`
			`class TestHW(unittest.TestCase):`
			`def setUp(self):`
			`if getenv("MOCKGPU"): subprocess.run(["cargo", "build", "--release", "--manifest-path", "./extra/remu/Cargo.toml"], check=True)`

			`def test_simple(self):`
			`out = get_output("""`
			`v_mov_b32_e32 v10 42`
			`v_mov_b32_e32 v1 v10`
			`""", n_threads=2)`
			`np.testing.assert_equal(out, 42)`

			`def test_exec_mov(self):`
			`out = get_output("""`
			`v_mov_b32_e32 v10 42`
			`s_mov_b32_e32 exec_lo 0b10`
			`v_mov_b32_e32 v10 10`
			`s_mov_b32_e32 exec_lo 0b11`
			`v_mov_b32_e32 v1 v10`
			`""", n_threads=2)`
			`np.testing.assert_equal(out, [42, 10])`

			`def test_exec_cmp_vopc(self):`
			`out = get_output("""`
			`s_mov_b32 vcc_lo 0 // reset vcc`
			`v_mov_b32_e32 v10 42`
			`v_mov_b32_e32 v11 10`
			`s_mov_b32_e32 exec_lo 0b01`
			`v_cmp_ne_u32 v10 v11`
			`s_mov_b32_e32 exec_lo 0b11`
			`v_mov_b32_e32 v1 vcc_lo`
			`""", n_threads=2)`
			`np.testing.assert_equal(out, 0b01)`

			`def test_exec_cmpx_vop3(self):`
			`out = get_output("""`
			`v_mov_b32_e32 v10 42`
			`v_mov_b32_e32 v11 10`
			`s_mov_b32_e32 exec_lo 0b01`
			`v_cmpx_ne_u32 v10 v11`
			`s_mov_b32_e32 s10 exec_lo`
			`s_mov_b32_e32 exec_lo 0b11`
			`v_mov_b32_e32 v1 s10`
			`""", n_threads=2)`
			`np.testing.assert_equal(out, 0b01)`

			`def test_fmac_vop3_modifier(self):`
			`init_state = f"""`
			`v_mov_b32_e32 v10 {f16_to_bits(4.0)}`
			`v_mov_b32_e32 v11 {f16_to_bits(3.0)}`
			`v_mov_b32_e32 v1 {f16_to_bits(2.0)}`
			`"""`
			`self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 v11 v10"), f16_to_bits(14.))`
			`self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 -v11 v10"), f16_to_bits(-10.))`
			`self.assertEqual(get_output(init_state+"\n"+"v_fmac_f16_e64 v1 -v11 -v10"), f16_to_bits(14.))`

			`def test_s_abs_i32(self):`
			`def s_abs_i32(x, y, dst="s10", scc=0):`
			`for reg,val in [(dst, y), ("scc", scc)]:`
			`self.assertEqual(get_output(f"""`
			`s_mov_b32_e32 {dst} {x}`
			`s_abs_i32 {dst} {dst}`
			`v_mov_b32_e32 v1 {reg}`
			`""")[0], val)`
			`s_abs_i32(0x00000001, 0x00000001, scc=1)`
			`s_abs_i32(0x7fffffff, 0x7fffffff, scc=1)`
			`s_abs_i32(0x80000000, 0x80000000, scc=1)`
			`s_abs_i32(0x80000001, 0x7fffffff, scc=1)`
			`s_abs_i32(0x80000002, 0x7ffffffe, scc=1)`
			`s_abs_i32(0xffffffff, 0x00000001, scc=1)`
			`s_abs_i32(0, 0, scc=0)`

			`def test_v_rcp_f32_neg_vop3(self):`
			`def v_neg_rcp_f32(x:float, y:float):`
			`out = get_output(f"""`
			`v_mov_b32_e32 v1 {f32_to_bits(x)}`
			`v_rcp_f32_e64 v1, -v1`
			`""")[0]`
			`assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"`
			`v_neg_rcp_f32(math.inf, -0.0)`
			`v_neg_rcp_f32(-math.inf, 0.0)`
			`v_neg_rcp_f32(0.0, -math.inf)`
			`v_neg_rcp_f32(-0.0, math.inf)`
			`v_neg_rcp_f32(-2.0, 0.5)`
			`v_neg_rcp_f32(2.0, -0.5)`

			`def test_v_cndmask_b32_neg(self):`
			`def v_neg(x:int\|float, y:float):`
			`out = get_output(f"""`
			`v_mov_b32_e32 v1 {f32_to_bits(x)}`
			`s_mov_b32_e32 s10 1 // always pick -v1`
			`v_cndmask_b32 v1, v1, -v1 s10`
			`""")[0]`
			`assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"`
			`v_neg(-0.0, 0.0)`
			`v_neg(0.0, -0.0)`
			`v_neg(2.0, -2.0)`
			`v_neg(math.inf, -math.inf)`
			`v_neg(-math.inf, math.inf)`

			`if __name__ == "__main__":`
			`unittest.main()`