openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

81 lines
2.5 KiB

import numpy as np
import pathlib
from hexdump import hexdump
from tinygrad.helpers import colored
from extra.helpers import enable_early_exec
early_exec = enable_early_exec()
from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH
ENABLE_NON_ASM = False
WMMA = True
DUAL_ALU = True
F32 = True
if ENABLE_NON_ASM:
buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
prg_empty = CLProgram("code", "__kernel void code(__global float *a) { a[0] = 1; }")
asm_real = prg_empty.binary()
with open("/tmp/cc.elf", "wb") as f:
f.write(asm_real)
prg_empty([1], [1], buf, wait=True)
print(buf.toCPU())
print(colored("creating CLBuffer", "green"))
buf = CLBuffer.fromCPU(np.zeros(10, np.float32))
code = open(pathlib.Path(__file__).parent / "prog.s", "r").read()
gen = []
FLOPS = 0
MAX_REG = 251
for j in range(1):
if WMMA:
KY, KX = 4, 4
for y in range(KY):
for x in range(KX):
c = (y*KX+x)*8
a = (KY*KX*8) + y*8
b = (KY*KX*8) + (KY*8) + x*8
gen.append(f"v_wmma_f32_16x16x16_f16 v[{c}:{c+7}], v[{a}:{a+7}], v[{b}:{b+7}], v[{c}:{c+7}]")
FLOPS += 16*8*2
else:
for i in range(0, MAX_REG, 6):
if DUAL_ALU:
if F32:
gen.append(f"v_dual_fmac_f32 v{i+0}, v{i+1}, v{i+2} :: v_dual_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
FLOPS += 4
else:
gen.append(f"v_dual_dot2acc_f32_f16 v{i+0}, v{i+1}, v{i+2} :: v_dual_dot2acc_f32_f16 v{i+3}, v{i+4}, v{i+5}")
FLOPS += 8
else:
assert F32
gen.append(f"v_fmac_f32 v{i+0}, v{i+1}, v{i+2}")
gen.append(f"v_fmac_f32 v{i+3}, v{i+4}, v{i+5}")
code = code.replace("// FLOPS", '\n'.join(gen))
print(code)
# fix: COMGR failed to get code object ISA name. set triple to 'amdgcn-amd-amdhsa'
object = early_exec(([ROCM_LLVM_PATH / "llvm-mc", '--arch=amdgcn', '--mcpu=gfx1100', '--triple=amdgcn-amd-amdhsa', '--filetype=obj', '-'], code.encode("utf-8")))
asm = early_exec(([ROCM_LLVM_PATH / "ld.lld", "/dev/stdin", "-o", "/dev/stdout", "--pie"], object))
with open("/tmp/cc2.o", "wb") as f:
f.write(object)
with open("/tmp/cc2.elf", "wb") as f:
f.write(asm)
print(colored("creating CLProgram", "green"))
prg = CLProgram("code", asm)
print(colored("running program", "green"))
G = 512
FLOPS *= 100000*G*G # loop * global_size
for i in range(3):
tm = prg(buf, global_size=[G//256, G, 1], local_size=[256, 1, 1], wait=True)
print(f"ran in {tm*1e3:.2f} ms, {FLOPS/(tm*1e9):.2f} GFLOPS")
print(colored("transferring buffer", "green"))
print(buf.toCPU())