openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

241 lines
9.2 KiB

import ctypes, struct, platform, pathlib, os, binascii
from hexdump import hexdump
from tinygrad.helpers import to_mv, DEBUG, getenv
from tinygrad.runtime.autogen import libc, cuda
from tinygrad.device import CPUProgram
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.ops_cuda import cu_time_execution
def _hook(fxn_address_value, tramp):
  """Overwrite the machine code at `fxn_address_value` with the bytes in `tramp`.

  Temporarily makes the containing pages writable, copies the trampoline over the
  function entry point, restores execute-only protection, then flushes the
  instruction cache so the CPU doesn't execute stale bytes.
  """
  # round down to the start of the page containing the function entry
  page_address = (fxn_address_value//0x1000)*0x1000
  # 0x2000 covers two pages in case the patched range straddles a page boundary; 7 = PROT_READ|WRITE|EXEC
  ret = libc.mprotect(page_address, 0x2000, 7)
  assert ret == 0
  libc.memcpy(fxn_address_value, tramp, len(tramp))
  # restore to 5 = PROT_READ|EXEC after patching
  ret = libc.mprotect(page_address, 0x2000, 5)
  assert ret == 0
  # invalidate the instruction cache over the patched range (required on aarch64)
  CPUProgram.rt_lib["__clear_cache"](fxn_address_value, fxn_address_value + len(tramp))
def install_hook(c_function, python_function):
  """Redirect `c_function` to `python_function` by patching a jump trampoline over its entry.

  `python_function` must be a ctypes CFUNCTYPE callback with a matching signature.
  Returns an `original(*args)` callable that temporarily un-patches the function,
  calls the real implementation, and re-patches the hook before returning.
  """
  python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
  # AARCH64 trampoline to ioctl
  if (processor:=platform.processor()) == "aarch64":
    # 0x0000000000000000: 70 00 00 10  adr x16, #0xc      ; x16 = address of the literal below
    # 0x0000000000000004: 10 02 40 F9  ldr x16, [x16]     ; load the target address
    # 0x0000000000000008: 00 02 1F D6  br  x16            ; jump to python_function
    tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
    tramp += struct.pack("Q", python_function_addr)
  elif processor == "x86_64":
    # 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa  movabs r11, <address>
    # 0x000000000000000a: 41 FF E3                       jmp r11
    tramp = b"\x49\xBB" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE3"
  else:
    raise Exception(f"processor {processor} not supported")
  tramp = ctypes.create_string_buffer(tramp)

  # get real function address
  fxn_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
  fxn_address_value = fxn_address.contents.value
  #print(f"** hooking function at 0x{fxn_address_value}")

  # save the bytes we are about to clobber so `original` can restore them
  orig_save = (ctypes.c_char*len(tramp))()
  libc.memcpy(orig_save, fxn_address_value, len(tramp))
  _hook(fxn_address_value, tramp)

  def original(*args):
    # un-patch, call the real driver function, then re-install the hook
    _hook(fxn_address_value, orig_save)
    ret = c_function(*args)
    _hook(fxn_address_value, tramp)
    return ret
  return original
hooked = {}            # api name -> `original` callable returned by install_hook (forwards to the real driver)
allocated_memory = {}  # device pointer -> bytesize recorded in cuMemAlloc_v2
function_names = {}    # CUfunction handle address -> (demangled) kernel name from cuModuleGetFunction
seen_modules = set()   # CUmodule handle addresses observed in cuModuleLoadData
@ctypes.CFUNCTYPE(ctypes.c_int)
def dummy():
  # catch-all hook: loudly flags any driver entry point we expected to be unused
  print("**** dummy function hook ****")
  return -1
@ctypes.CFUNCTYPE(*([cuda.cuInit.restype] + cuda.cuInit.argtypes))
def cuInit(flags):
  """Log every cuInit call, then forward to the real driver entry point."""
  print("call cuInit", flags)
  real_cuInit = hooked["cuInit"]
  return real_cuInit(flags)
@ctypes.CFUNCTYPE(*([cuda.cuMemHostAlloc.restype] + cuda.cuMemHostAlloc.argtypes))
def cuMemHostAlloc(pp, bytesize, flags):
  """Log pinned host allocations, then forward to the real cuMemHostAlloc."""
  print(f"cuMemHostAlloc {bytesize}")
  forward = hooked["cuMemHostAlloc"]
  return forward(pp, bytesize, flags)
@ctypes.CFUNCTYPE(*([cuda.cuModuleLoadData.restype] + cuda.cuModuleLoadData.argtypes))
def cuModuleLoadData(module, image):
  # forward first so the driver fills in the CUmodule handle before we read it
  ret = hooked["cuModuleLoadData"](module, image)
  module_address = ctypes.addressof(module.contents.contents)
  print(f"cuModuleLoadData 0x{image:x} -> 0x{module_address:X}")
  seen_modules.add(module_address)
  # exploratory code for dumping the loaded image, kept for debugging:
  #images, sections, relocs = elf_loader(bytes(to_mv(image, 0x100000)))
  #for s in sections: print(s)
  #print('\n'.join([x for x in maps.split("\n") if 'libcuda' in x]))
  #hexdump(to_mv(image, 0x1000))
  #image, sections, relocs = elf_loader(to_mv(image))
  #print(sections)
  return ret
@ctypes.CFUNCTYPE(*([cuda.cuModuleGetFunction.restype] + cuda.cuModuleGetFunction.argtypes))
def cuModuleGetFunction(hfunc, hmod, name):
  """Record a (demangled) kernel name for each CUfunction handle, then forward."""
  ret = hooked["cuModuleGetFunction"](hfunc, hmod, name)
  python_name = ctypes.string_at(name).decode()
  # pip install git+https://github.com/wbenny/pydemangler.git
  import pydemangler
  demangled = pydemangler.demangle(python_name)
  if demangled is not None:
    python_name = demangled
  print(f"called cuModuleGetFunction 0x{ctypes.addressof(hmod.contents):X} {python_name}")
  function_names[ctypes.addressof(hfunc.contents.contents)] = python_name
  return ret
@ctypes.CFUNCTYPE(*([cuda.cuMemAlloc_v2.restype] + cuda.cuMemAlloc_v2.argtypes))
def cuMemAlloc_v2(dptr, bytesize):
  """Track device allocations in `allocated_memory`, then forward to the real call."""
  status = hooked["cuMemAlloc_v2"](dptr, bytesize)
  device_ptr = ctypes.addressof(dptr.contents)
  allocated_memory[device_ptr] = bytesize
  print(f"cuMemAlloc_v2 {bytesize} 0x{device_ptr:X}")
  return status
@ctypes.CFUNCTYPE(*([cuda.cuLaunchKernel.restype] + cuda.cuLaunchKernel.argtypes))
def cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra):
  # run the real launch wrapped in cu_time_execution (True enables timing)
  tm = cu_time_execution(lambda:
    hooked["cuLaunchKernel"](f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra), True)
  name = function_names[ctypes.addressof(f.contents)]
  print(f"{tm*1e6:9.2f} us -- cuLaunchKernel <<{gridDimX:6d}, {gridDimY:5d}, {gridDimZ:5d}>>",
        f"<<{blockDimX:4d}, {blockDimY:4d}, {blockDimZ:4d}>> {sharedMemBytes} {name}")
  if extra: hexdump(to_mv(extra, 0x100))
  if getenv("PARAMS") and kernelParams:
    #print(f"params @ 0x{ctypes.addressof(kernelParams.contents):X}")
    # enumerate (offset, size) for each kernel parameter until the index runs past the end
    params = []
    while True:
      ret = cuda.cuFuncGetParamInfo(f, len(params), ctypes.byref(paramOffset:=ctypes.c_size_t()), ctypes.byref(paramSize:=ctypes.c_size_t()))
      if ret != 0: break
      params.append((paramOffset.value, paramSize.value))
    #params_dat = to_mv(kernelParams.contents, params[-1][0] + params[-1][1])
    # kernelParams is an array of 8-byte pointers, one per parameter; dump each pointee
    params_ptr = to_mv(kernelParams, len(params)*8).cast("Q")
    #params_dat = to_mv(kernelParams.contents, params[-1][0] + params[-1][1])
    for i,(off,sz) in enumerate(params):
      hexdump(to_mv(params_ptr[i], sz))
    #hexdump(params_dat)
    #for i,(off,sz) in enumerate(params):
    #  print(f"{i}: offset:{off:3d} size:{sz:3d}") # --", binascii.hexlify(dat).decode())
    #  hexdump(params_dat[off:off+sz])
    #if name == "exp2_kernel_vectorized4_kernel":
    #  ptr_0 = struct.unpack("Q", params_dat[0x10:0x18])[0]
    #  hexdump(to_mv(ptr_0, 0x80))
    #ptr_1 = struct.unpack("Q", to_mv(ptr_0, 8))[0]
    #print(f"params 0x{ctypes.addressof(kernelParams):X}")
    #hexdump(to_mv(kernelParams, 0x100))
    #print(f"data 0x{to_mv(kernelParams, 8).cast('Q')[0]:X}")
    #hexdump(to_mv(kernelParams.contents, 0x80))
    #for i,addr in enumerate(to_mv(kernelParams.contents, 0x100).cast("Q")): print(f"{i*8:3d}: {addr:X}")
  # NOTE(review): always reports 0 (CUDA_SUCCESS) to the caller, discarding the real launch status
  return 0
if __name__ == "__main__":
#out = cuda.CUmoduleLoadingMode()
#print(cuda.cuModuleGetLoadingMode(ctypes.byref(out)))
#print(out.value)
hooked['cuInit'] = install_hook(cuda.cuInit, cuInit)
hooked['cuModuleGetFunction'] = install_hook(cuda.cuModuleGetFunction, cuModuleGetFunction)
hooked['cuLaunchKernel'] = install_hook(cuda.cuLaunchKernel, cuLaunchKernel)
# memory stuff
hooked['cuMemAlloc_v2'] = install_hook(cuda.cuMemAlloc_v2, cuMemAlloc_v2)
hooked['cuMemHostAlloc'] = install_hook(cuda.cuMemHostAlloc, cuMemHostAlloc)
# module loading + not used module loading
hooked['cuModuleLoadData'] = install_hook(cuda.cuModuleLoadData, cuModuleLoadData)
install_hook(cuda.cuModuleLoad, dummy)
install_hook(cuda.cuModuleLoadDataEx, dummy)
install_hook(cuda.cuModuleLoadFatBinary, dummy)
# library stuff (doesn't seem used)
#install_hook(cuda.cuLibraryLoadData, dummy)
#install_hook(cuda.cuLibraryLoadFromFile, dummy)
#install_hook(cuda.cuLibraryGetModule, dummy)
#install_hook(cuda.cuMemAllocManaged, dummy)
# unused
#install_hook(cuda.cuFuncGetModule, dummy)
#install_hook(cuda.cuModuleGetGlobal_v2, dummy)
# hook v1
#install_hook(cuda._libraries['libcuda.so'].cuModuleGetGlobal, dummy)
#install_hook(cuda._libraries['libcuda.so'].cuMemAlloc, dummy)
#install_hook(cuda._libraries['libcuda.so'].cuLinkComplete, dummy)
#nvjitlink = ctypes.CDLL("/home/tiny/.local/lib/python3.10/site-packages/nvidia/nvjitlink/lib/libnvJitLink.so.12")
#install_hook(nvjitlink.nvJitLinkCreate, dummy)
#nvrtc = ctypes.CDLL("/home/tiny/.local/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.11.2")
#nvrtc = ctypes.CDLL("/usr/local/cuda-12.4/targets/x86_64-linux/lib/libnvrtc.so.12.4.127")
#from tinygrad.runtime.autogen import nvrtc
#install_hook(nvrtc.nvrtcCreateProgram, dummy)
#install_hook(nvrtc.nvJitLinkCreate, dummy)
#import tinygrad.runtime.autogen.nvrtc as nvrtc
#install_hook(nvrtc.nvJitLinkCreate, dummy)
#install_hook(nvrtc.nvrtcCreateProgram, dummy)
#hooked['cuLinkCreate'] = install_hook(cuda.cuLinkCreate, dummy)
if getenv("TINYGRAD"):
from tinygrad import Tensor
(Tensor.zeros(6, device="CUDA").contiguous()*2).realize()
exit(0)
print("importing torch...")
import torch
print("torch", torch.__version__, torch.__file__)
if getenv("RESNET"):
import torchvision.models as models
model = models.resnet18(pretrained=True)
model = model.cuda()
model.eval()
if getenv("COMPILE"): model = torch.compile(model)
X = torch.rand(getenv("BS", 1), 3, 288, 288, device='cuda')
model(X)
print("\n\n\n****** second run ******\n")
model(X)
else:
a = torch.zeros(4, 4).cuda()
b = torch.zeros(4, 4).cuda()
print("tensor created")
print(f"a: 0x{a.data_ptr():X}")
print(f"b: 0x{b.data_ptr():X}")
a += 1
b += 2
a = a.exp2()
b = b.exp2()
a += b
#c = a @ b
print("tensor math done", a.cpu().numpy())
# confirm cuda library is right
#maps = pathlib.Path("/proc/self/maps").read_text()
#print('\n'.join([x for x in maps.split("\n") if 'cuda' in x or 'nv' in x]))