# NVIDIA GPU ioctl/mmap sniffer (tinygrad extra tooling).
# (repository web-page chrome from the scrape removed; it was not part of the source)
# type: ignore
import ctypes, ctypes.util, struct, platform, pathlib, re, time, os, signal
from tinygrad.helpers import from_mv, to_mv, getenv, init_c_struct_t
from hexdump import hexdump
start = time.perf_counter()
# *** ioctl lib ***
# handle to libc so we can call syscall/mprotect/memcpy directly
libc = ctypes.CDLL(ctypes.util.find_library("c"))
processor = platform.processor()
# raw syscall numbers for ioctl and mmap on the two supported architectures
IOCTL_SYSCALL = {"aarch64": 0x1d, "x86_64":16}[processor]
MMAP_SYSCALL = {"aarch64": 0xde, "x86_64":0x09}[processor]
def get_struct(argp, stype):
  """Reinterpret the raw integer pointer `argp` as a ctypes struct of type `stype`."""
  ptr = ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype))
  return ptr.contents
def dump_struct(st):
  """Print a one-line `Name { field=value ... }` dump of ctypes struct `st` when IOCTL>=1."""
  if getenv("IOCTL", 0) == 0: return
  body = "".join(f"{fname}={getattr(st, fname)} " for fname, *_ in type(st)._fields_)
  print(f"\t {st.__class__.__name__} {{ {body}}}")
def format_struct(s):
  """Return a list of "name:value" strings for the fields of ctypes struct `s` (ints rendered as hex)."""
  parts = []
  for fname, *_ in s._fields_:
    val = getattr(s, fname)
    parts.append(f"{fname}:0x{val:X}" if isinstance(val, int) else f"{fname}:{val}")
  return parts
# NOTE(review): real_func_pool is never referenced in this file -- looks like a leftover.
real_func_pool = {}
def install_hook(c_function, python_function):
  """Overwrite the first bytes of libc's `c_function` with a jump into `python_function`.

  `python_function` must be a ctypes CFUNCTYPE callback. Returns a 4096-byte buffer
  holding a copy of the original function's first page, which callers can use to
  invoke the original implementation (see _mmap below).
  """
  orig_func = (ctypes.c_char*4096)()
  # machine address of the ctypes-generated callback thunk
  python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
  # AARCH64 trampoline to ioctl
  if processor == "aarch64":
    # 0x0000000000000000: 70 00 00 10 adr x16, #0xc
    # 0x0000000000000004: 10 02 40 F9 ldr x16, [x16]
    # 0x0000000000000008: 00 02 1F D6 br x16
    tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
    tramp += struct.pack("Q", python_function_addr)
  elif processor == "x86_64":
    # 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa movabs r11, <address>
    # 0x000000000000000a: 41 FF E3 jmp r11
    tramp = b"\x49\xBB" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE3"
  else:
    raise Exception(f"processor {processor} not supported")
  # get real ioctl address
  ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))
  # hook ioctl: make the target code page and our save buffer writable+executable (rwx=7)
  ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
  assert ret == 0
  ret = libc.mprotect(ctypes.c_ulong((ctypes.addressof(orig_func)//0x1000)*0x1000), 0x3000, 7)
  assert ret == 0
  # save a copy of the original code, then splat the trampoline over its entry point
  libc.memcpy(orig_func, ioctl_address.contents, 0x1000)
  libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
  return orig_func
# *** ioctl lib end ***
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
# reverse lookup tables built from the autogenerated nv_gpu bindings:
# escape-code number -> name, and control-command number -> (name, params struct or None)
nvescs = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("NV_ESC")}
nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(nv_gpu, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(nv_gpu) if \
  x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}
def get_classes():
  """Scrape the class-id -> class-name mapping out of the autogenerated nv_gpu.py source."""
  src = (pathlib.Path(__file__).parent.parent.parent / "tinygrad/runtime/autogen/nv_gpu.py").read_text()
  # restrict the scan to the span of class-id macro definitions
  span = re.search(r'NV01_ROOT.*?NV_SEMAPHORE_SURFACE = \(0x000000da\) # macro', src, re.DOTALL).group()
  found = re.findall(r'([0-9a-zA-Z_]*) = +\((0x[0-9a-fA-F]+)\)', span, re.MULTILINE)
  return {int(value, base=16): ident for ident, value in found}
nvclasses = get_classes()  # class id -> class name
# UVM ioctl request number -> name (only requests that have a *_PARAMS struct)
nvuvms = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("UVM_") and nv_gpu.__dict__.get(x+"_PARAMS")}
# pushbuffer method id -> name for the compute / gpfifo / dma-copy classes
nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}
global_ioctl_id = 0  # running count of intercepted ioctls, used to label printed lines
gpus_user_modes = []
gpus_mmio = []
gpus_fifo = []  # (gpFifoOffset, gpFifoEntries) for every allocated GPFIFO channel
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
def ioctl(fd, request, argp):
  """Hooked ioctl: forward to the real syscall, then decode and optionally print the request.

  Verbosity is controlled by the IOCTL env var: >=1 prints command names and param
  structs, >=2 additionally prints the raw decoded ioctl fields. Returns the real
  syscall's return value, so the caller is unaffected by the tracing.

  Fix vs. original: the perGpuAttributes print was missing its f-string prefix, so it
  printed the literal text "{i}" instead of the attribute index.
  """
  global global_ioctl_id, gpus_user_modes, gpus_mmio
  global_ioctl_id += 1
  st = time.perf_counter()
  # forward to the kernel first so decoding below sees the post-call (output) params
  ret = libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
  et = time.perf_counter()-st
  fn = os.readlink(f"/proc/self/fd/{fd}")
  #print(f"ioctl {request:8x} {fn:20s}")
  # standard linux ioctl request encoding: direction, size, type (magic byte), nr
  idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
  if getenv("IOCTL", 0) >= 1: print(f"#{global_ioctl_id}: ", end="")
  if itype == ord(nv_gpu.NV_IOCTL_MAGIC):
    if nr == nv_gpu.NV_ESC_RM_CONTROL:
      s = get_struct(argp, nv_gpu.NVOS54_PARAMETERS)
      if s.cmd in nvcmds:
        name, struc = nvcmds[s.cmd]
        if getenv("IOCTL", 0) >= 1:
          print(f"NV_ESC_RM_CONTROL cmd={name:30s} hClient={s.hClient}, hObject={s.hObject}, flags={s.flags}, params={s.params}, paramsSize={s.paramsSize}, status={s.status}")
          if struc is not None: dump_struct(get_struct(s.params, struc))
          elif hasattr(nv_gpu, name+"_PARAMS"): dump_struct(get_struct(argp, getattr(nv_gpu, name+"_PARAMS")))
          elif name == "NVA06C_CTRL_CMD_GPFIFO_SCHEDULE": dump_struct(get_struct(argp, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS))
          elif name == "NV83DE_CTRL_CMD_GET_MAPPINGS": dump_struct(get_struct(s.params, nv_gpu.NV83DE_CTRL_DEBUG_GET_MAPPINGS_PARAMETERS))
      else:
        if getenv("IOCTL", 0) >= 1: print("unhandled cmd", hex(s.cmd))
        # format_struct(s)
        # print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
    elif nr == nv_gpu.NV_ESC_RM_ALLOC:
      s = get_struct(argp, nv_gpu.NVOS21_PARAMETERS)
      if getenv("IOCTL", 0) >= 1: print(f"NV_ESC_RM_ALLOC hClass={nvclasses.get(s.hClass, f'unk=0x{s.hClass:X}'):30s}, hRoot={s.hRoot}, hObjectParent={s.hObjectParent}, pAllocParms={s.pAllocParms}, hObjectNew={s.hObjectNew} status={s.status}")
      if s.pAllocParms is not None:
        # dump the class-specific allocation params for the classes we understand
        if s.hClass == nv_gpu.NV01_DEVICE_0: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV0080_ALLOC_PARAMETERS))
        if s.hClass == nv_gpu.FERMI_VASPACE_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS))
        if s.hClass == nv_gpu.NV50_MEMORY_VIRTUAL: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
        if s.hClass == nv_gpu.NV1_MEMORY_USER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
        if s.hClass == nv_gpu.NV1_MEMORY_SYSTEM: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
        if s.hClass == nv_gpu.GT200_DEBUGGER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV83DE_ALLOC_PARAMETERS))
        if s.hClass == nv_gpu.AMPERE_CHANNEL_GPFIFO_A:
          sx = get_struct(s.pAllocParms, nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS)
          dump_struct(sx)
          # remember the ring so _dump_gpfifo can walk it later
          gpus_fifo.append((sx.gpFifoOffset, sx.gpFifoEntries))
        if s.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS))
      if s.hClass == nv_gpu.TURING_USERMODE_A: gpus_user_modes.append(s.hObjectNew)
    elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY:
      # nv_ioctl_nvos33_parameters_with_fd
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.NVOS33_PARAMETERS)
        print(f"NV_ESC_RM_MAP_MEMORY hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, length={s.length} flags={s.flags} pLinearAddress={s.pLinearAddress}")
    elif nr == nv_gpu.NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.NVOS56_PARAMETERS)
        print(f"NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, pOldCpuAddress={s.pOldCpuAddress} pNewCpuAddress={s.pNewCpuAddress} status={s.status}")
    elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
        print(f"NV_ESC_RM_ALLOC_MEMORY fd={s.fd}, hRoot={s.params.hRoot}, hObjectParent={s.params.hObjectParent}, hObjectNew={s.params.hObjectNew}, hClass={s.params.hClass}, flags={s.params.flags}, pMemory={s.params.pMemory}, limit={s.params.limit}, status={s.params.status}")
    elif nr == nv_gpu.NV_ESC_ALLOC_OS_EVENT:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.nv_ioctl_alloc_os_event_t)
        print(f"NV_ESC_ALLOC_OS_EVENT hClient={s.hClient} hDevice={s.hDevice} fd={s.fd} Status={s.Status}")
    elif nr == nv_gpu.NV_ESC_REGISTER_FD:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.nv_ioctl_register_fd_t)
        print(f"NV_ESC_REGISTER_FD fd={s.ctl_fd}")
    elif nr in nvescs:
      if getenv("IOCTL", 0) >= 1: print(nvescs[nr])
    else:
      if getenv("IOCTL", 0) >= 1: print("unhandled NR", nr)
  elif fn.endswith("nvidia-uvm"):
    if getenv("IOCTL", 0) >= 1:
      print(f"{nvuvms.get(request, f'UVM UNKNOWN {request=}')}")
      if nvuvms.get(request) is not None: dump_struct(get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS")))
      if nvuvms.get(request) == "UVM_MAP_EXTERNAL_ALLOCATION":
        st = get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS"))
        for i in range(st.gpuAttributesCount):
          # BUGFIX: was a plain string literal, printing "{i}" verbatim
          print(f"perGpuAttributes[{i}] = ", end="")
          dump_struct(st.perGpuAttributes[i])
  if getenv("IOCTL") >= 2: print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", fn)
  return ret
@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
def _mmap(addr, length, prot, flags, fd, offset):
  """Hooked mmap: invoke the saved original implementation, log the call, return its result."""
  proto = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
  real_mmap = proto(ctypes.addressof(orig_mmap_mv))  # orig_mmap_mv holds the pre-hook code
  ret = real_mmap(addr, length, prot, flags, fd, offset)
  # ll = os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else ""
  print(f"mmap {addr=}, {length=}, {prot=}, {flags=}, {fd=}, {offset=} {ret=}")
  return ret
# install the hooks at import time; mmap is only hooked at IOCTL>=3 since it is very noisy
install_hook(libc.ioctl, ioctl)
if getenv("IOCTL") >= 3: orig_mmap_mv = install_hook(libc.mmap, _mmap)
import collections
old_gpputs = collections.defaultdict(int)  # per-ring GPPut position we have already dumped up to
def _dump_gpfifo(mark):
  """Walk every known GPFIFO ring and decode the entries pushed since the last call.

  `mark` is only a debug label. Returns the list of QMD launch structs found by
  _dump_qmd across all rings.
  """
  launches = []
  # print("_dump_gpfifo:", mark)
  for start, size in gpus_fifo:
    # the control block (which holds GPPut) sits directly after the `size` 8-byte ring entries
    gpfifo_controls = nv_gpu.AmpereAControlGPFifo.from_address(start+size*8)
    gpfifo = to_mv(start, size * 8).cast("Q")
    # consume entries from our last-seen position up to the hardware's GPPut
    while old_gpputs[start] != gpfifo_controls.GPPut:
      # entry: low 40 bits hold the (4-byte aligned) pushbuf address, bits [61:42] the dword count
      addr = ((gpfifo[old_gpputs[start]] & ((1 << 40)-1)) >> 2) << 2
      pckt_cnt = (gpfifo[old_gpputs[start]]>>42)&((1 << 20)-1)
      # print(f"\t{i}: 0x{gpfifo[i % size]:x}: addr:0x{addr:x} packets:{pckt_cnt} sync:{(gpfifo[i % size] >> 63) & 0x1} fetch:{gpfifo[i % size] & 0x1}")
      x = _dump_qmd(addr, pckt_cnt)
      if isinstance(x, list): launches += x
      old_gpputs[start] += 1
      old_gpputs[start] %= size  # the ring wraps
  return launches
import types
def make_qmd_struct_type():
  """Build a ctypes bitfield struct type matching the QMD V03_00 compute-queue layout.

  Bit positions come from the NVC6C0_QMDV03_00_* (hi_bit, lo_bit) tuples in nv_gpu;
  gaps are padded with reserved bitfields and adjacent *_lower/*_upper 32-bit pairs
  are merged into single 64-bit fields.
  """
  fields: list[tuple[str, type, int]] = []
  bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
  # indexed QMD fields are exposed as callables taking the index; expand indices 0..7
  bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
  bits = sorted(bits, key=lambda x: x[1][1])  # sort by the low bit position
  for i,(name, data) in enumerate(bits):
    # pad any hole between consecutive fields with a reserved bitfield
    if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
    fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
    # collapse a matching foo_lower / foo_upper pair into one 64-bit `foo` field
    if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
      fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
  return init_c_struct_t(tuple(fields))
qmd_struct_t = make_qmd_struct_type()
# sanity check: the QMD must be exactly 64 dwords (256 bytes)
assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4
def _dump_qmd(address, packets):
  """Decode `packets` dwords of pushbuffer starting at `address`.

  Prints each method when IOCTL>=1 and returns the list of QMD structs referenced
  by the decoded methods (i.e. the kernel launches).
  """
  qmds = []
  gpfifo = to_mv(address, packets * 4).cast("I")
  i = 0
  while i < packets:
    dat = gpfifo[i]
    typ = (dat>>28) & 0xF
    if typ == 0: break  # type 0 header terminates decoding
    # method header: payload size in dwords, subchannel, method address (dword index << 2)
    size = (dat>>16) & 0xFFF
    subc = (dat>>13) & 7
    mthd = (dat<<2) & 0x7FFF
    method_name = nvqcmds.get(mthd, f"unknown method #{mthd}")
    if getenv("IOCTL", 0) >= 1:
      print(f"\t\t{method_name}, {typ=} {size=} {subc=} {mthd=}")
      for j in range(size): print(f"\t\t\t{j}: {gpfifo[i+j+1]} | 0x{gpfifo[i+j+1]:x}")
    if mthd == 792:
      # NOTE(review): 792 appears to be an inline-QMD upload; the QMD payload starts 3 dwords in -- confirm
      qmds.append(qmd_struct_t.from_address(address + 12 + i * 4))
    elif mthd == nv_gpu.NVC6C0_SEND_PCAS_A:
      # SEND_PCAS_A's first payload dword is the QMD address >> 8; shift it back
      qmds.append(qmd_struct_t.from_address(gpfifo[i+1] << 8))
    i += size + 1  # skip header plus payload
  return qmds
# This is to be used in fuzzer, check cuda/nv side by side.
# Return a state which should be compare and compare function.
def before_launch():
  """Drain the GPFIFO rings so the next dump contains only the upcoming launch."""
  _dump_gpfifo("before launch")
def collect_last_launch_state():
  """Return the QMD structs pushed since before_launch() was called."""
  return _dump_gpfifo("after launch")
def compare_launch_state(states, good_states):
  """Compare a captured QMD launch state against a known-good (CUDA) one.

  Used by the fuzzer to check the tinygrad NV backend side by side with CUDA.
  Both arguments are lists of QMD-like objects (attributes accessed via getattr);
  each must contain exactly one state. Returns (True, "PASS") when every checked
  field matches, otherwise (False, reason).

  Fix vs. original: 'invalidate_shader_data_cache' was listed twice in the
  exact-match field set (harmless duplicate work, removed), and the inner
  constant-buffer loop no longer shadows the outer loop index.
  """
  states = states or list()
  good_states = good_states or list()
  if len(states) != 1 or len(good_states) != 1:
    return False, f"Some states not captured. {len(states)}!=1 || {len(good_states)}!=1"
  for i in range(len(states)):
    state, good_state = states[i], good_states[i]
    # fields that must match exactly
    for n in ['qmd_major_version', 'invalidate_shader_data_cache',
              'sm_global_caching_enable', 'invalidate_texture_header_cache', 'invalidate_texture_sampler_cache',
              'barrier_count', 'sampler_index', 'api_visible_call_limit', 'cwd_membar_type', 'sass_version',
              'max_sm_config_shared_mem_size', 'register_count_v', 'shared_memory_size']:
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"
    # Allow NV to allocate more, at least this is not exact problem, so ignore it here.
    # Hmm, CUDA minimum is 0x640, is this hw-required minimum (will check)?
    if state.shader_local_memory_high_size < good_state.shader_local_memory_high_size and good_state.shader_local_memory_high_size > 0x640:
      return False, f"Field shader_local_memory_high_size mismatch: {state.shader_local_memory_high_size}vs{good_state.shader_local_memory_high_size}"
    # TODO: Can't request more, since it might not be optimal, but need to investigate their formula for this.. #7133
    if state.min_sm_config_shared_mem_size > good_state.min_sm_config_shared_mem_size and good_state.min_sm_config_shared_mem_size > 5:
      return (False,
        f"Field min_sm_config_shared_mem_size mismatch: {state.min_sm_config_shared_mem_size}vs{good_state.min_sm_config_shared_mem_size}")
    if state.target_sm_config_shared_mem_size > good_state.target_sm_config_shared_mem_size and good_state.target_sm_config_shared_mem_size > 5:
      return (False,
        f"Field target_sm_config_shared_mem_size mismatch: {state.target_sm_config_shared_mem_size}vs{good_state.target_sm_config_shared_mem_size}")
    for cb in range(8):
      if cb in {1, 7}: continue # shaders don't use that. what's cuda put here?
      n = f"constant_buffer_valid_{cb}"
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"
  return True, "PASS"
# IOCTL=1 PTX=1 CUDA=1 python3 test/test_ops.py TestOps.test_tiny_add