You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
80 lines
2.5 KiB
80 lines
2.5 KiB
import numpy as np
|
|
import pathlib
|
|
from hexdump import hexdump
|
|
from tinygrad.helpers import colored
|
|
from extra.helpers import enable_early_exec
|
|
early_exec = enable_early_exec()
|
|
|
|
from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH
|
|
|
|
# Switches for this script.
# ENABLE_NON_ASM: additionally build and run a tiny kernel from OpenCL C source
# before the hand-written assembly path.
ENABLE_NON_ASM = False

# Instruction-mix selectors read by the generator loop later in the script.
WMMA = True      # emit v_wmma_f32_16x16x16_f16 instructions
DUAL_ALU = True  # only consulted when WMMA is off: use the v_dual_* forms
F32 = True       # with DUAL_ALU: f32 fmac rather than f16 dot2acc

if ENABLE_NON_ASM:
    # Compile a one-line kernel, dump its binary to /tmp/cc.elf, run it once
    # and print the buffer it was given.
    buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
    prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }")
    asm_real = prg_empty.binary()
    pathlib.Path("/tmp/cc.elf").write_bytes(asm_real)
    prg_empty([1], [1], buf, wait=True)
    print(buf.toCPU())
|
|
|
|
print(colored("creating CLBuffer", "green"))
# Zero-filled buffer of ten float32 values.
buf = CLBuffer.fromCPU(np.zeros(10, np.float32))

# Load the assembly template stored next to this script. read_text() closes the
# file handle immediately; the previous bare open(...).read() left the handle
# open until the garbage collector got around to it.
code = (pathlib.Path(__file__).parent / "prog.s").read_text()
|
|
|
|
# Build the instruction sequence that replaces the "// FLOPS" marker in the
# assembly template, counting the floating point operations it performs.
gen = []
FLOPS = 0
MAX_REG = 251  # the non-WMMA generators consume registers six at a time below this bound
for _ in range(1):  # the generated sequence is emitted once
    if WMMA:
        # Register layout: KY*KX accumulator blocks of 8 registers come first,
        # followed by one 8-register A block per y and one 8-register B block per x.
        KY, KX = 4, 4
        for y in range(KY):
            for x in range(KX):
                c = (y*KX+x)*8
                a = (KY*KX*8) + y*8
                b = (KY*KX*8) + (KY*8) + x*8
                gen.append(f"v_wmma_f32_16x16x16_f16 v[{c}:{c+7}], v[{a}:{a+7}], v[{b}:{b+7}], v[{c}:{c+7}]")
                FLOPS += 16*8*2
    else:
        for i in range(0, MAX_REG, 6):
            if DUAL_ALU:
                if F32:
                    gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
                    FLOPS += 4
                else:
                    gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}")
                    FLOPS += 8
            else:
                # Only the f32 variant exists for the single-issue path.
                assert F32
                gen.append(f"v_fmac_f32 v{i+0}, v{i+1}, v{i+2}")
                gen.append(f"v_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
                # Two single fmacs do the same arithmetic as one v_dual_fmac_f32
                # pair, so they are counted the same way. Previously this branch
                # never touched FLOPS and the script reported 0.00 GFLOPS.
                FLOPS += 4
code = code.replace("// FLOPS", '\n'.join(gen))
print(code)
|
|
|
|
|
|
# Assemble the generated source with llvm-mc, then link the resulting object
# with ld.lld. early_exec receives a (command, input bytes) tuple; presumably
# the bytes are fed to the tool on stdin and its stdout is returned -- confirm
# against extra.helpers.
#
# The explicit triple is the fix for:
#   COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa'
#
# The intermediate result is named obj_bytes so the builtin `object` is not
# shadowed for the rest of the module.
obj_bytes = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], obj_bytes))

# Write both the relocatable object and the linked ELF to /tmp.
with open("/tmp/cc2.o", "wb") as f:
    f.write(obj_bytes)
with open("/tmp/cc2.elf", "wb") as f:
    f.write(asm)
|
|
|
|
# Wrap the linked binary in a CLProgram, time three launches, then print the buffer.
print(colored("creating CLProgram", "green"))
prg = CLProgram("code", asm)

print(colored("running program", "green"))
G = 512
# Scale the per-pass count to the whole launch: loop * global_size.
# NOTE(review): 100000 presumably matches the loop count inside prog.s -- confirm.
FLOPS = FLOPS * (100000 * G * G)
for _ in range(3):
    tm = prg(
        buf,
        global_size=[G // 256, G, 1],
        local_size=[256, 1, 1],
        wait=True,
    )
    print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS")

print(colored("transferring buffer", "green"))
print(buf.toCPU())
|
|
|