import ctypes, struct, platform, pathlib, os, binascii, itertools
from hexdump import hexdump
from tinygrad.helpers import to_mv, DEBUG, getenv, colored, time_to_str
from tinygrad.runtime.autogen import libc, cuda
from tinygrad.device import CPUProgram, Device
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.ops_cuda import cu_time_execution
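# This script hooks the CUDA driver API (libcuda) in-process by patching binary
# trampolines over selected cu* entry points. Every memory allocation and kernel launch
# made by the host program (for example, a torch CUDA workload) is then recorded as a
# HookEvent, with kernel parameters resolved back to the allocations they point into.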
print(f"hooking CUDA runtime, running with {Device.DEFAULT}")
# TODO: regen and make cuda 12 default?
cuda.cuFuncGetParamInfo = cuda._libraries['libcuda.so'].cuFuncGetParamInfo
cuda.cuFuncGetParamInfo.restype = cuda.CUresult
cuda.cuFuncGetParamInfo.argtypes = [cuda.CUfunction, cuda.size_t, ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64)]
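# cuFuncGetParamInfo(func, idx, &offset, &size) reports the byte offset and size of the
# idx-th kernel parameter; it is bound by hand here, presumably because the autogen
# bindings were generated against an older CUDA (see the TODO above).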
ignore_dispatch = [False] # default value is False
def push_ignore_dispatch(val):
  global ignore_dispatch
  ignore_dispatch.append(val)

def pop_ignore_dispatch():
  global ignore_dispatch
  ignore_dispatch.pop()
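# ignore_dispatch acts as a stack: push True to drop every cuLaunchKernel dispatch in a
# region without recording or executing it, then pop to resume tracing. A minimal usage
# sketch (run_warmup is a hypothetical caller-side function):
#
#   push_ignore_dispatch(True)
#   run_warmup()              # kernels launched here are ignored by the hook
#   pop_ignore_dispatch()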
hooked = {}
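# _hook overwrites the first len(tramp) bytes of a function resident in libcuda with a
# jump trampoline: make the surrounding pages writable (rwx), memcpy the trampoline in,
# restore r-x protection, then flush the instruction cache so the patched bytes are used.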
def _hook(fxn_address_value, tramp):
  page_address = (fxn_address_value//0x1000)*0x1000
  ret = libc.mprotect(page_address, 0x2000, 7)
  assert ret == 0
  libc.memcpy(fxn_address_value, tramp, len(tramp))
  ret = libc.mprotect(page_address, 0x2000, 5)
  assert ret == 0
  CPUProgram.rt_lib["__clear_cache"](fxn_address_value, fxn_address_value + len(tramp))
def install_hook(c_function, python_function):
  python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
  # AARCH64 trampoline to the python callback
  if (processor:=platform.processor()) == "aarch64":
    # 0x0000000000000000: 70 00 00 10    adr x16, #0xc
    # 0x0000000000000004: 10 02 40 F9    ldr x16, [x16]
    # 0x0000000000000008: 00 02 1F D6    br  x16
    tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
    tramp += struct.pack("Q", python_function_addr)
  elif processor == "x86_64":
    # 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa    movabs r11, <address>
    # 0x000000000000000a: 41 FF E3                         jmp r11
    tramp = b"\x49\xBB" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE3"
  else:
    raise Exception(f"processor {processor} not supported")
  tramp = ctypes.create_string_buffer(tramp)

  # get real function address
  fxn_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
  fxn_address_value = fxn_address.contents.value
  #print(f"** hooking function at 0x{fxn_address_value}")

  # save the original bytes, then patch in the trampoline
  orig_save = (ctypes.c_char*len(tramp))()
  libc.memcpy(orig_save, fxn_address_value, len(tramp))
  _hook(fxn_address_value, tramp)

  def original(*args):
    _hook(fxn_address_value, orig_save)
    ret = c_function(*args)
    _hook(fxn_address_value, tramp)
    return ret
  return original
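# install_hook patches `c_function` to jump into `python_function` (a ctypes callback with
# a matching signature) and returns a callable that still reaches the original: it briefly
# restores the saved bytes, calls the real function, then re-installs the trampoline.
# A sketch of the pattern used below:
#
#   hooked["cuMemAlloc_v2"] = install_hook(cuda.cuMemAlloc_v2, cuMemAlloc_v2)
#   ...
#   hooked["cuMemAlloc_v2"](dptr, bytesize)   # calls the real cuMemAlloc_v2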
allocated_memory_enum = 0
allocated_memory = {}
function_names = {}
tiny_devs = {}
seen_modules = set()
global_events = []
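# Events appended to global_events by the hooks below; collect_events() drains the list.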
class HookEvent: pass
class HookMemAllocEvent(HookEvent):
  def __init__(self, cuda_address, bytesize, enum): self.cuda_address, self.bytesize, self.enum = cuda_address, bytesize, enum
  def __repr__(self): return f"tensor alloc: {self.enum}: {self.cuda_address:#x} - {self.bytesize:#x} bytes"
class HookConstParamEvent(HookEvent):
  def __init__(self, value): self.value = value
  def __repr__(self): return f"const({self.value:#x})"
class HookTensorParamEvent(HookEvent):
  def __init__(self, cuda_address, offset, enum): self.cuda_address, self.offset, self.enum = cuda_address, offset, enum
  def __repr__(self): return f"tensor{self.enum}({self.cuda_address:#x}, {self.offset=:#x})"
class HookKernelCallEvent(HookEvent):
  def __init__(self, grid, block, tm, ptm, name, params): self.grid, self.block, self.tm, self.ptm, self.name, self.params = grid, block, tm, ptm, name, params
  def __repr__(self): return f"kernel call <<{self.grid}>> <<{self.block}>> {self.ptm}\n | {self.params}\n | {self.name}"
def collect_events(clear=False):
  global global_events
  x = global_events
  if clear: global_events = []
  return x
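# Each hook below is a ctypes CFUNCTYPE callback with the same restype/argtypes as the
# driver entry point it replaces. Typical consumer-side loop (a sketch; the consumer
# lives outside this file):
#
#   for ev in collect_events(clear=True):
#     if isinstance(ev, HookKernelCallEvent): print(ev.name, ev.ptm)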
@ctypes.CFUNCTYPE(*([cuda.cuDeviceGet.restype] + cuda.cuDeviceGet.argtypes))
def cuDeviceGet(device, ordinal):
  tiny_devs[ordinal] = Device[f"{Device.DEFAULT}:{ordinal}"]
  device.contents.value = ordinal
  return cuda.CUDA_SUCCESS
@ctypes.CFUNCTYPE(*([cuda.cuMemHostAlloc.restype] + cuda.cuMemHostAlloc.argtypes))
def cuMemHostAlloc(pp, bytesize, flags):
  print(f"cuMemHostAlloc {bytesize}")
  return hooked["cuMemHostAlloc"](pp, bytesize, flags)
@ctypes.CFUNCTYPE(*([cuda.cuModuleLoadData.restype] + cuda.cuModuleLoadData.argtypes))
def cuModuleLoadData(module, image):
  ret = hooked["cuModuleLoadData"](module, image)
  module_address = ctypes.addressof(module.contents.contents)
  seen_modules.add(module_address)
  return ret
@ctypes.CFUNCTYPE(*([cuda.cuModuleGetFunction.restype] + cuda.cuModuleGetFunction.argtypes))
def cuModuleGetFunction(hfunc, hmod, name):
  ret = hooked["cuModuleGetFunction"](hfunc, hmod, name)
  python_name = ctypes.string_at(name).decode()
  # pip install git+https://github.com/wbenny/pydemangler.git
  import pydemangler
  demangled_name = pydemangler.demangle(python_name)
  if demangled_name is not None: python_name = demangled_name
  # print(f"called cuModuleGetFunction 0x{ctypes.addressof(hmod.contents):X} {python_name}")
  function_names[ctypes.addressof(hfunc.contents.contents)] = python_name
  return ret
@ctypes.CFUNCTYPE(*([cuda.cuMemAlloc_v2.restype] + cuda.cuMemAlloc_v2.argtypes))
def cuMemAlloc_v2(dptr, bytesize):
  global allocated_memory_enum
  ret = hooked["cuMemAlloc_v2"](dptr, bytesize)
  cuda_address = dptr.contents.value
  allocated_memory[cuda_address] = (bytesize, allocated_memory_enum)
  global_events.append(HookMemAllocEvent(cuda_address, bytesize, allocated_memory_enum))
  if DEBUG >= 3: print(global_events[-1])
  allocated_memory_enum += 1
  return ret
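# cuLaunchKernel is the main hook: it times the real launch with cu_time_execution, then
# walks the kernel's parameter layout via cuFuncGetParamInfo. Pointer-sized words are
# matched against allocated_memory so device pointers are reported as tensor references
# (allocation number + offset) rather than raw addresses. Parameters arrive either through
# kernelParams (an array of per-parameter pointers) or through the packed `extra` buffer.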
@ctypes.CFUNCTYPE(*([cuda.cuLaunchKernel.restype] + cuda.cuLaunchKernel.argtypes))
def cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra):
  global ignore_dispatch
  name = function_names[ctypes.addressof(f.contents)]
  if ignore_dispatch[-1]:
    if DEBUG >= 4: print(f"ignoring dispatch {name}")
    return 0
  tm = cu_time_execution(lambda:
    hooked["cuLaunchKernel"](f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra), True)
  ptm = colored(time_to_str(tm, w=9), "yellow" if tm > 0.01 else "green")
  params = []
  while True:
    ret = cuda.cuFuncGetParamInfo(f, len(params), ctypes.byref(paramOffset:=ctypes.c_size_t()), ctypes.byref(paramSize:=ctypes.c_size_t()))
    if ret != 0: break
    params.append((paramOffset.value, paramSize.value))
  ev_params = []
  if extra: params_ptr = to_mv(extra, 5*8).cast("Q")
  else: params_ptr = to_mv(kernelParams, len(params)*8).cast("Q")
  for i,(off,sz) in enumerate(params):
    sz_to_let = {1: 'B', 2: 'H', 4: 'I', 8: 'Q'}
    if sz >= 8:
      for j in range(sz//8):
        if extra: value = to_mv(params_ptr[1] + off, sz).cast("Q")[0]
        else: value = to_mv(params_ptr[i] + j*8, 8).cast('Q')[0]
        has_in_allocated_mem, lcoff, alnum = False, 0, -1
        for taddr, (tsz, talnum) in allocated_memory.items():
          if taddr <= value < taddr + tsz:
            has_in_allocated_mem = True
            lcoff = value - taddr
            alnum = talnum
            break
        if has_in_allocated_mem: ev_params.append(HookTensorParamEvent(value, lcoff, alnum))
        else: ev_params.append(HookConstParamEvent(value))
    else:
      if extra: value = to_mv(params_ptr[1] + off, sz).cast(sz_to_let[sz])[0]
      else: value = to_mv(params_ptr[i], sz).cast(sz_to_let[sz])[0]
      ev_params.append(HookConstParamEvent(value))
  global_events.append(HookKernelCallEvent((gridDimX, gridDimY, gridDimZ), (blockDimX, blockDimY, blockDimZ), tm, ptm, name, ev_params))
  if DEBUG >= 3: print(global_events[-1])
  return 0
def create_hook(func_name, restype, argtypes):
  def hook_template(*args):
    # print(func_name, flush=True)
    return hooked[func_name](*args)
  return ctypes.CFUNCTYPE(restype, *argtypes)(hook_template)
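# create_hook is a generic passthrough variant of the hooks above (same wrapping, no
# event recording); it is defined here but not installed by install_hooks below.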
def install_hooks():
  hooked['cuModuleGetFunction'] = install_hook(cuda.cuModuleGetFunction, cuModuleGetFunction)
  hooked['cuLaunchKernel'] = install_hook(cuda.cuLaunchKernel, cuLaunchKernel)
  # memory stuff
  hooked['cuMemAlloc_v2'] = install_hook(cuda.cuMemAlloc_v2, cuMemAlloc_v2)
  hooked['cuMemHostAlloc'] = install_hook(cuda.cuMemHostAlloc, cuMemHostAlloc)
  # module loading
  hooked['cuModuleLoadData'] = install_hook(cuda.cuModuleLoadData, cuModuleLoadData)
NVPROFILER = os.environ.get("NV_COMPUTE_PROFILER_PERFWORKS_DIR", None) # set when running under the NSIGHT profiler
if NVPROFILER is None: install_hooks()
else:
  print("Detected NSIGHT profiler, hooking not available.")
  cuda._libraries['libcuda.so'] = ctypes.CDLL(NVPROFILER + "/libcuda-injection.so")
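# A minimal usage sketch (hypothetical driver script; module and workload names are
# illustrative): importing this file installs the hooks, after which any CUDA work in
# the process is traced.
#
#   import hook_cuda                   # this module; hooks are installed on import
#   run_cuda_workload()                # hypothetical: anything that calls into libcuda
#   for ev in hook_cuda.collect_events(clear=True): print(ev)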