openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

68 lines
2.5 KiB

from tinygrad import Device, dtypes
from tinygrad.helpers import getenv, colorize_float
from extra.optimization.helpers import load_worlds, ast_str_to_lin
from test.external.fuzz_linearizer import get_fuzz_rawbufs
from tinygrad.engine.search import bufs_from_lin
from tinygrad.engine.realize import CompiledRunner
from tinygrad.tensor import _to_np_dtype
import numpy as np
def bench_kernel(prg, test_bufs, bufs, label, runs=5, failed_time=1e9):
  """Launch one compiled kernel and return ``(best_time, ok)``.

  ``prg`` is first called once on ``test_bufs``: this is the warmup launch, and
  it leaves its output in ``test_bufs`` for the caller to inspect. It is then
  called ``runs`` times on ``bufs`` and the smallest value it returned is
  reported as ``best_time``.

  If any launch raises ``RuntimeError``, ``"<label> FAILED"`` is printed and
  ``(failed_time, False)`` is returned; the error is not propagated.
  """
  try:
    prg(test_bufs, {}, wait=True)
    return min(prg(bufs, {}, wait=True) for _ in range(runs)), True
  except RuntimeError:
    print(f"{label} FAILED")
    return failed_time, False

if __name__ == "__main__":
  ast_strs = load_worlds(filter_reduce=False, filter_novariable=True)
  cudev = Device["CUDA"]
  nvdev = Device["NV"]

  # the same NV renderer object is used for every kernel, so tag it once here
  nv_renderer = nvdev.renderer
  nv_renderer.device = "NV"

  # NUM=112 python3 test/external/speed_compare_cuda_nv.py
  single = getenv("NUM", -1)
  if single != -1: ast_strs = ast_strs[single:single+1]

  # running sums of the per-kernel best times (despite the names, not divided by a count)
  average_tm_cuda, average_tm_nv = 0, 0
  for num, ast in enumerate(ast_strs):
    # cuda compile
    culin = ast_str_to_lin(ast, opts=cudev.renderer)
    culin.hand_coded_optimizations()
    # bf16 kernels skip both the fuzzed inputs and the numeric comparison below
    has_bf16 = any(b.dtype == dtypes.bfloat16 for b in culin.membufs)

    cuda_prg = CompiledRunner(culin.to_program())
    cubufs = bufs_from_lin(culin)
    test_cubufs = cubufs if has_bf16 else get_fuzz_rawbufs(culin)

    # nv compile
    nvlin = ast_str_to_lin(ast, opts=nv_renderer)
    nvlin.hand_coded_optimizations()
    nv_prg = CompiledRunner(nvlin.to_program())
    nvbufs = bufs_from_lin(nvlin)
    test_nvbufs = nvbufs if has_bf16 else get_fuzz_rawbufs(nvlin)
    if not has_bf16:
      # copy the CUDA test inputs into the NV test buffers so both backends see the same data
      for i, rawbuf in enumerate(test_nvbufs): rawbuf.copyin(test_cubufs[i].as_buffer())

    # warmup on the test buffers, then timed runs on the benchmark buffers
    best_cuda, cuda_ok = bench_kernel(cuda_prg, test_cubufs, cubufs, "CUDA")
    best_nv, nv_ok = bench_kernel(nv_prg, test_nvbufs, nvbufs, "NV")

    if cuda_ok and nv_ok and not has_bf16:
      # compare buffer 0 of each backend after the warmup launch (presumably the kernel output -- confirm)
      curesult = np.frombuffer(test_cubufs[0].as_buffer(), _to_np_dtype(test_cubufs[0].dtype))
      nvresult = np.frombuffer(test_nvbufs[0].as_buffer(), _to_np_dtype(test_nvbufs[0].dtype))
      np.testing.assert_allclose(curesult, nvresult, rtol=1e-2, atol=1e-2)

    # NOTE: a failed backend contributes the 1e9 sentinel here, which skews the cumulative ratio afterwards
    average_tm_cuda += best_cuda
    average_tm_nv += best_nv
    ratio = best_nv / best_cuda
    # times are scaled by 1e6 and labelled "us", so the runner presumably returns seconds
    print(f"{average_tm_nv/average_tm_cuda:5.2f}x -- {num:4d} {colorize_float(ratio)} {best_nv*1e6:7.2f} us", nvlin.name)
    # dump the kernel when NV is more than 4% slower than CUDA
    if ratio > 1.04: print(f"NV slower {ratio}", nvlin.ast, nvlin.applied_opts)