import numpy as np import unittest import subprocess, struct, math from tinygrad import Tensor, dtypes, Device, UOp from tinygrad.helpers import getenv from tinygrad.runtime.support.compiler_amd import amdgpu_disassemble from tinygrad.renderer import ProgramSpec from tinygrad.engine.realize import CompiledRunner def get_output(asm:str, n_threads:int=1): input_asm = "\n".join([ln if ln.strip().startswith('asm volatile') else f'asm volatile("{ln.strip().lstrip()}" : "+v"(a), "+v"(b));' for ln in asm.strip().splitlines() if ln.strip()]) src = f""" typedef long unsigned int size_t; extern "C" __attribute__((device, const)) size_t __ockl_get_local_id(unsigned int); extern "C" __attribute__((global)) void __attribute__((amdgpu_flat_work_group_size(1, {n_threads}))) test(unsigned int* data0_1) {{ int l = __ockl_get_local_id(0); unsigned a = 0, b = 0, c = 0; {input_asm} unsigned res; asm volatile("v_mov_b32 %0, %1" : "=v"(res) : "v"(a)); *(data0_1+l) = res; }}""" t = Tensor.zeros(n_threads, dtype=dtypes.uint32).contiguous().realize() prg = ProgramSpec("test", src, Device.DEFAULT, UOp.sink(t), global_size=[1, 1, 1], local_size=[n_threads, 1, 1]) car = CompiledRunner(prg) if getenv("PRINT_ASM"): amdgpu_disassemble(car.lib) car([t.uop.buffer], {}, wait=True) return t.numpy() def f16_to_bits(x:float) -> int: return struct.unpack(' float: return struct.unpack(' int: return struct.unpack('