import numpy as np import pathlib from hexdump import hexdump from tinygrad.helpers import colored from extra.helpers import enable_early_exec early_exec = enable_early_exec() from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH ENABLE_NON_ASM = False WMMA = True DUAL_ALU = True F32 = True if ENABLE_NON_ASM: buf = CLBuffer.fromCPU(np.zeros(10, np.float32)) prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }") asm_real = prg_empty.binary() with open("/tmp/cc.elf", "wb") as f: f.write(asm_real) prg_empty([1], [1], buf, wait=True) print(buf.toCPU()) print(colored("creating CLBuffer", "green")) buf = CLBuffer.fromCPU(np.zeros(10, np.float32)) code = open(pathlib.Path(__file__).parent / "prog.s", "r").read() gen = [] FLOPS = 0 MAX_REG = 251 for j in range(1): if WMMA: KY, KX = 4, 4 for y in range(KY): for x in range(KX): c = (y*KX+x)*8 a = (KY*KX*8) + y*8 b = (KY*KX*8) + (KY*8) + x*8 gen.append(f"v_wmma_f32_16x16x16_f16 v[{c}:{c+7}], v[{a}:{a+7}], v[{b}:{b+7}], v[{c}:{c+7}]") FLOPS += 16*8*2 else: for i in range(0, MAX_REG, 6): if DUAL_ALU: if F32: gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}") FLOPS += 4 else: gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}") FLOPS += 8 else: assert F32 gen.append(f"v_fmac_f32 v{i+0}, v{i+1}, v{i+2}") gen.append(f"v_fmac_f32 v{i+3}, v{i+4}, v{i+5}") code = code.replace("// FLOPS", '\n'.join(gen)) print(code) # fix: COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa' object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8"))) asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object)) with open("/tmp/cc2.o", "wb") as f: f.write(object) with open("/tmp/cc2.elf", "wb") as f: f.write(asm) print(colored("creating CLProgram", "green")) prg = CLProgram("code", asm) print(colored("running program", "green")) G = 512 FLOPS *= 100000*G*G # loop * global_size for i in range(3): tm = prg(buf, global_size=[G//256, G, 1], local_size=[256, 1, 1], wait=True) print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS") print(colored("transferring buffer", "green")) print(buf.toCPU())