# type: ignore
import ctypes, ctypes.util, struct, platform, pathlib, re, time, os, signal
from typing import Any, List, Tuple, Type, Union
from tinygrad.helpers import from_mv, to_mv, getenv, init_c_struct_t
from hexdump import hexdump
start = time.perf_counter()

# *** ioctl lib ***
libc = ctypes.CDLL(ctypes.util.find_library("c"))
processor = platform.processor()
IOCTL_SYSCALL = {"aarch64": 0x1d, "x86_64": 0x10}[processor]
MMAP_SYSCALL = {"aarch64": 0xde, "x86_64": 0x09}[processor]

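# helpers to read and pretty-print ctypes structs that live at raw addresses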
def get_struct(argp, stype):
  return ctypes.cast(ctypes.c_void_p(argp), ctypes.POINTER(stype)).contents

def dump_struct(st):
  if getenv("IOCTL", 0) == 0: return
  print("\t", st.__class__.__name__, end=" { ")
  for v in type(st)._fields_: print(f"{v[0]}={getattr(st, v[0])}", end=" ")
  print("}")

def format_struct(s):
  sdats = []
  for field in s._fields_:
    dat = getattr(s, field[0])
    if isinstance(dat, int): sdats.append(f"{field[0]}:0x{dat:X}")
    else: sdats.append(f"{field[0]}:{dat}")
  return sdats

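# install_hook overwrites the first instructions of a libc function with a
# trampoline that jumps to a Python ctypes callback, and returns a buffer
# holding a copy of the original code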
real_func_pool = {}
def install_hook(c_function, python_function):
  orig_func = (ctypes.c_char*4096)()
  python_function_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
  # AARCH64 trampoline to ioctl
  if processor == "aarch64":
    # 0x0000000000000000:  70 00 00 10    adr x16, #0xc
    # 0x0000000000000004:  10 02 40 F9    ldr x16, [x16]
    # 0x0000000000000008:  00 02 1F D6    br  x16
    tramp = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6"
    tramp += struct.pack("Q", python_function_addr)
  elif processor == "x86_64":
    # 0x0000000000000000:  49 BB aa aa aa aa aa aa aa aa    movabs r11, <address>
    # 0x000000000000000a:  41 FF E3                         jmp    r11
    tramp = b"\x49\xBB" + struct.pack("Q", python_function_addr) + b"\x41\xFF\xE3"
  else:
    raise Exception(f"processor {processor} not supported")

  # get the address of the real function
  ioctl_address = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong))

  # make both the function's pages and our save buffer writable+executable,
  # save the original code, then install the trampoline
  ret = libc.mprotect(ctypes.c_ulong((ioctl_address.contents.value//0x1000)*0x1000), 0x2000, 7)
  assert ret == 0
  ret = libc.mprotect(ctypes.c_ulong((ctypes.addressof(orig_func)//0x1000)*0x1000), 0x3000, 7)
  assert ret == 0
  libc.memcpy(orig_func, ioctl_address.contents, 0x1000)
  libc.memcpy(ioctl_address.contents, ctypes.create_string_buffer(tramp), len(tramp))
  return orig_func

# *** ioctl lib end ***
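# reverse-lookup tables built from the autogenerated nv_gpu bindings: escape
# codes, RM control commands (paired with their param structs), class ids,
# UVM requests, and pushbuffer queue methods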
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
nvescs = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("NV_ESC")}
nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(nv_gpu, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(nv_gpu) if \
  x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}

def get_classes():
  hdrpy = (pathlib.Path(__file__).parent.parent.parent / "tinygrad/runtime/autogen/nv_gpu.py").read_text()
  clss = re.search(r'NV01_ROOT.*?NV_SEMAPHORE_SURFACE = \(0x000000da\) # macro', hdrpy, re.DOTALL).group()
  pattern = r'([0-9a-zA-Z_]*) = +\((0x[0-9a-fA-F]+)\)'
  matches = re.findall(pattern, clss, re.MULTILINE)
  return {int(num, base=16):name for name, num in matches}
nvclasses = get_classes()
nvuvms = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("UVM_") and nv_gpu.__dict__.get(x+"_PARAMS")}
nvqcmds = {int(getattr(nv_gpu, x)):x for x in dir(nv_gpu) if x[:7] in {"NVC6C0_", "NVC56F_", "NVC6B5_"} and isinstance(getattr(nv_gpu, x), int)}

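# state accumulated across ioctls: usermode objects and, for every allocated
# GPFIFO channel, its (gpFifoOffset, gpFifoEntries) ring so it can be dumped later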
global_ioctl_id = 0
gpus_user_modes = []
gpus_mmio = []
gpus_fifo = []

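# the hooked ioctl: forward to the real syscall first, then decode the request.
# a linux ioctl request packs dir (2 bits) | size (14) | type (8) | nr (8).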
@ctypes.CFUNCTYPE(ctypes.c_int, ctypes.c_int, ctypes.c_ulong, ctypes.c_void_p)
def ioctl(fd, request, argp):
  global global_ioctl_id, gpus_user_modes, gpus_mmio
  global_ioctl_id += 1
  st = time.perf_counter()
  ret = libc.syscall(IOCTL_SYSCALL, ctypes.c_int(fd), ctypes.c_ulong(request), ctypes.c_void_p(argp))
  et = time.perf_counter()-st
  fn = os.readlink(f"/proc/self/fd/{fd}")
  #print(f"ioctl {request:8x} {fn:20s}")
  idir, size, itype, nr = (request>>30), (request>>16)&0x3FFF, (request>>8)&0xFF, request&0xFF
  if getenv("IOCTL", 0) >= 1: print(f"#{global_ioctl_id}: ", end="")
  if itype == ord(nv_gpu.NV_IOCTL_MAGIC):
    if nr == nv_gpu.NV_ESC_RM_CONTROL:
      s = get_struct(argp, nv_gpu.NVOS54_PARAMETERS)
      if s.cmd in nvcmds:
        name, struc = nvcmds[s.cmd]
        if getenv("IOCTL", 0) >= 1:
          print(f"NV_ESC_RM_CONTROL cmd={name:30s} hClient={s.hClient}, hObject={s.hObject}, flags={s.flags}, params={s.params}, paramsSize={s.paramsSize}, status={s.status}")

        if struc is not None: dump_struct(get_struct(s.params, struc))
        elif hasattr(nv_gpu, name+"_PARAMS"): dump_struct(get_struct(argp, getattr(nv_gpu, name+"_PARAMS")))
        elif name == "NVA06C_CTRL_CMD_GPFIFO_SCHEDULE": dump_struct(get_struct(argp, nv_gpu.NVA06C_CTRL_GPFIFO_SCHEDULE_PARAMS))
        elif name == "NV83DE_CTRL_CMD_GET_MAPPINGS": dump_struct(get_struct(s.params, nv_gpu.NV83DE_CTRL_DEBUG_GET_MAPPINGS_PARAMETERS))
      else:
        if getenv("IOCTL", 0) >= 1: print("unhandled cmd", hex(s.cmd))
      # format_struct(s)
      # print(f"{(st-start)*1000:7.2f} ms +{et*1000.:7.2f} ms : {ret:2d} = {name:40s}", ' '.join(format_struct(s)))
    elif nr == nv_gpu.NV_ESC_RM_ALLOC:
      s = get_struct(argp, nv_gpu.NVOS21_PARAMETERS)
      if getenv("IOCTL", 0) >= 1: print(f"NV_ESC_RM_ALLOC hClass={nvclasses.get(s.hClass, f'unk=0x{s.hClass:X}'):30s}, hRoot={s.hRoot}, hObjectParent={s.hObjectParent}, pAllocParms={s.pAllocParms}, hObjectNew={s.hObjectNew} status={s.status}")
      if s.pAllocParms is not None:
        if s.hClass == nv_gpu.NV01_DEVICE_0: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV0080_ALLOC_PARAMETERS))
        if s.hClass == nv_gpu.FERMI_VASPACE_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_VASPACE_ALLOCATION_PARAMETERS))
        if s.hClass == nv_gpu.NV50_MEMORY_VIRTUAL: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
        if s.hClass == nv_gpu.NV1_MEMORY_USER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
        if s.hClass == nv_gpu.NV1_MEMORY_SYSTEM: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_MEMORY_ALLOCATION_PARAMS))
        if s.hClass == nv_gpu.GT200_DEBUGGER: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV83DE_ALLOC_PARAMETERS))
        if s.hClass == nv_gpu.AMPERE_CHANNEL_GPFIFO_A:
          sx = get_struct(s.pAllocParms, nv_gpu.NV_CHANNELGPFIFO_ALLOCATION_PARAMETERS)
          dump_struct(sx)
          gpus_fifo.append((sx.gpFifoOffset, sx.gpFifoEntries))
        if s.hClass == nv_gpu.KEPLER_CHANNEL_GROUP_A: dump_struct(get_struct(s.pAllocParms, nv_gpu.NV_CHANNEL_GROUP_ALLOCATION_PARAMETERS))
        if s.hClass == nv_gpu.TURING_USERMODE_A: gpus_user_modes.append(s.hObjectNew)
    elif nr == nv_gpu.NV_ESC_RM_MAP_MEMORY:
      # nv_ioctl_nvos33_parameters_with_fd
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.NVOS33_PARAMETERS)
        print(f"NV_ESC_RM_MAP_MEMORY hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, length={s.length} flags={s.flags} pLinearAddress={s.pLinearAddress}")
    elif nr == nv_gpu.NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.NVOS56_PARAMETERS)
        print(f"NV_ESC_RM_UPDATE_DEVICE_MAPPING_INFO hClient={s.hClient}, hDevice={s.hDevice}, hMemory={s.hMemory}, pOldCpuAddress={s.pOldCpuAddress} pNewCpuAddress={s.pNewCpuAddress} status={s.status}")
    elif nr == nv_gpu.NV_ESC_RM_ALLOC_MEMORY:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.nv_ioctl_nvos02_parameters_with_fd)
        print(f"NV_ESC_RM_ALLOC_MEMORY fd={s.fd}, hRoot={s.params.hRoot}, hObjectParent={s.params.hObjectParent}, hObjectNew={s.params.hObjectNew}, hClass={s.params.hClass}, flags={s.params.flags}, pMemory={s.params.pMemory}, limit={s.params.limit}, status={s.params.status}")
    elif nr == nv_gpu.NV_ESC_ALLOC_OS_EVENT:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.nv_ioctl_alloc_os_event_t)
        print(f"NV_ESC_ALLOC_OS_EVENT hClient={s.hClient} hDevice={s.hDevice} fd={s.fd} Status={s.Status}")
    elif nr == nv_gpu.NV_ESC_REGISTER_FD:
      if getenv("IOCTL", 0) >= 1:
        s = get_struct(argp, nv_gpu.nv_ioctl_register_fd_t)
        print(f"NV_ESC_REGISTER_FD fd={s.ctl_fd}")
    elif nr in nvescs:
      if getenv("IOCTL", 0) >= 1: print(nvescs[nr])
    else:
      if getenv("IOCTL", 0) >= 1: print("unhandled NR", nr)
  elif fn.endswith("nvidia-uvm"):
    if getenv("IOCTL", 0) >= 1:
      print(f"{nvuvms.get(request, f'UVM UNKNOWN {request=}')}")
      if nvuvms.get(request) is not None: dump_struct(get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS")))
      if nvuvms.get(request) == "UVM_MAP_EXTERNAL_ALLOCATION":
        st = get_struct(argp, getattr(nv_gpu, nvuvms.get(request)+"_PARAMS"))
        for i in range(st.gpuAttributesCount):
          print(f"perGpuAttributes[{i}] = ", end="")
          dump_struct(st.perGpuAttributes[i])

  if getenv("IOCTL") >= 2: print("ioctl", f"{idir=} {size=} {itype=} {nr=} {fd=} {ret=}", fn)
  return ret

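# optional mmap hook (installed when IOCTL >= 3): calls the saved original
# code of libc.mmap and logs every mapping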
@ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
def _mmap(addr, length, prot, flags, fd, offset):
  mmap_type = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long)
  orig_mmap = mmap_type(ctypes.addressof(orig_mmap_mv))
  ret = orig_mmap(addr, length, prot, flags, fd, offset)
  # ll = os.readlink(f"/proc/self/fd/{fd}") if fd >= 0 else ""
  print(f"mmap {addr=}, {length=}, {prot=}, {flags=}, {fd=}, {offset=} {ret=}")
  return ret

install_hook(libc.ioctl, ioctl)
if getenv("IOCTL") >= 3: orig_mmap_mv = install_hook(libc.mmap, _mmap)

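# note: with the hook installed, every ioctl made by this process (including
# tinygrad's NV backend) round-trips through the python ioctl() above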
import collections
old_gpputs = collections.defaultdict(int)
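# each 64-bit GPFIFO entry points at a pushbuffer segment: bits 0-39 hold the
# address (low 2 bits cleared) and bits 42-61 the length in 32-bit words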
def _dump_gpfifo(mark):
  launches = []

  # print("_dump_gpfifo:", mark)
  for start, size in gpus_fifo:
    gpfifo_controls = nv_gpu.AmpereAControlGPFifo.from_address(start+size*8)
    gpfifo = to_mv(start, size * 8).cast("Q")
    while old_gpputs[start] != gpfifo_controls.GPPut:
      addr = ((gpfifo[old_gpputs[start]] & ((1 << 40)-1)) >> 2) << 2
      pckt_cnt = (gpfifo[old_gpputs[start]]>>42)&((1 << 20)-1)

      # print(f"\t{i}: 0x{gpfifo[i % size]:x}: addr:0x{addr:x} packets:{pckt_cnt} sync:{(gpfifo[i % size] >> 63) & 0x1} fetch:{gpfifo[i % size] & 0x1}")
      x = _dump_qmd(addr, pckt_cnt)
      if isinstance(x, list): launches += x
      old_gpputs[start] += 1
      old_gpputs[start] %= size
  return launches

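# the QMD (Queue Meta Data) is the hardware kernel-launch descriptor. build a
# ctypes bitfield struct for it from the NVC6C0_QMDV03_00_* (hi, lo) bit-range
# constants in nv_gpu, merging adjacent _lower/_upper pairs into single u64 fields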
import types
def make_qmd_struct_type():
  fields: List[Tuple[str, Union[Type[ctypes.c_uint64], Type[ctypes.c_uint32]], Any]] = []
  bits = [(name,dt) for name,dt in nv_gpu.__dict__.items() if name.startswith("NVC6C0_QMDV03_00") and isinstance(dt, tuple)]
  bits += [(name+f"_{i}",dt(i)) for name,dt in nv_gpu.__dict__.items() for i in range(8) if name.startswith("NVC6C0_QMDV03_00") and callable(dt)]
  bits = sorted(bits, key=lambda x: x[1][1])
  for i,(name, data) in enumerate(bits):
    if i > 0 and (gap:=(data[1] - bits[i-1][1][0] - 1)) != 0: fields.append((f"_reserved{i}", ctypes.c_uint32, gap))
    fields.append((name.replace("NVC6C0_QMDV03_00_", "").lower(), ctypes.c_uint32, data[0]-data[1]+1))
    if len(fields) >= 2 and fields[-2][0].endswith('_lower') and fields[-1][0].endswith('_upper') and fields[-1][0][:-6] == fields[-2][0][:-6]:
      fields = fields[:-2] + [(fields[-1][0][:-6], ctypes.c_uint64, fields[-1][2] + fields[-2][2])]
  return init_c_struct_t(tuple(fields))
qmd_struct_t = make_qmd_struct_type()
assert ctypes.sizeof(qmd_struct_t) == 0x40 * 4

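# decode one pushbuffer segment. each method header word packs: type in bits
# 28-31, word count in bits 16-27, subchannel in bits 13-15, and the method
# address in the low 13 bits (shifted left by 2)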
def _dump_qmd(address, packets):
  qmds = []
  gpfifo = to_mv(address, packets * 4).cast("I")

  i = 0
  while i < packets:
    dat = gpfifo[i]
    typ = (dat>>28) & 0xF
    if typ == 0: break
    size = (dat>>16) & 0xFFF
    subc = (dat>>13) & 7
    mthd = (dat<<2) & 0x7FFF
    method_name = nvqcmds.get(mthd, f"unknown method #{mthd}")
    if getenv("IOCTL", 0) >= 1:
      print(f"\t\t{method_name}, {typ=} {size=} {subc=} {mthd=}")
      for j in range(size): print(f"\t\t\t{j}: {gpfifo[i+j+1]} | 0x{gpfifo[i+j+1]:x}")
    if mthd == 792:
      qmds.append(qmd_struct_t.from_address(address + 12 + i * 4))
    elif mthd == nv_gpu.NVC6C0_SEND_PCAS_A:
      qmds.append(qmd_struct_t.from_address(gpfifo[i+1] << 8))

    i += size + 1
  return qmds

# This is to be used in the fuzzer to check the cuda and nv backends side by side.
# collect_last_launch_state returns a state to compare; compare_launch_state is the compare function.
def before_launch(): _dump_gpfifo("before launch")
def collect_last_launch_state(): return _dump_gpfifo("after launch")

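# intended flow (a sketch): call before_launch() to consume already-queued
# GPFIFO entries, launch the same kernel on both backends, then
#   ok, msg = compare_launch_state(collect_last_launch_state(), good_states)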
def compare_launch_state(states, good_states):
  states = states or list()
  good_states = good_states or list()
  if len(states) != 1 or len(good_states) != 1:
    return False, f"Some states not captured. {len(states)}!=1 || {len(good_states)}!=1"

  for i in range(len(states)):
    state, good_state = states[i], good_states[i]

    for n in ['qmd_major_version', 'invalidate_shader_data_cache',
              'sm_global_caching_enable', 'invalidate_texture_header_cache', 'invalidate_texture_sampler_cache',
              'barrier_count', 'sampler_index', 'api_visible_call_limit', 'cwd_membar_type', 'sass_version',
              'max_sm_config_shared_mem_size', 'register_count_v', 'shared_memory_size']:
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"

    # Allow NV to allocate more; at least this is not an exactness problem, so ignore it here.
    # Hmm, the CUDA minimum is 0x640. Is this a hw-required minimum (will check)?
    if state.shader_local_memory_high_size < good_state.shader_local_memory_high_size and good_state.shader_local_memory_high_size > 0x640:
      return False, f"Field shader_local_memory_high_size mismatch: {state.shader_local_memory_high_size} vs {good_state.shader_local_memory_high_size}"

    # TODO: Can't request more, since it might not be optimal, but need to investigate their formula for this. #7133
    if state.min_sm_config_shared_mem_size > good_state.min_sm_config_shared_mem_size and good_state.min_sm_config_shared_mem_size > 5:
      return (False,
        f"Field min_sm_config_shared_mem_size mismatch: {state.min_sm_config_shared_mem_size} vs {good_state.min_sm_config_shared_mem_size}")
    if state.target_sm_config_shared_mem_size > good_state.target_sm_config_shared_mem_size and good_state.target_sm_config_shared_mem_size > 5:
      return (False,
        f"Field target_sm_config_shared_mem_size mismatch: {state.target_sm_config_shared_mem_size} vs {good_state.target_sm_config_shared_mem_size}")

    for i in range(8):
      if i in {1, 7}: continue  # shaders don't use these. what does cuda put here?
      n = f"constant_buffer_valid_{i}"
      if getattr(state, n) != getattr(good_state, n):
        return False, f"Field {n} mismatch: {getattr(state, n)} vs {getattr(good_state, n)}"

  return True, "PASS"

# IOCTL=1 PTX=1 CUDA=1 python3 test/test_ops.py TestOps.test_tiny_add