You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
242 lines
9.2 KiB
242 lines
9.2 KiB
7 hours ago
|
import ctypes, struct, platform, pathlib, os, binascii
|
||
|
from hexdump import hexdump
|
||
|
from tinygrad.helpers import to_mv, DEBUG, getenv
|
||
|
from tinygrad.runtime.autogen import libc, cuda
|
||
|
from tinygrad.device import CPUProgram
|
||
|
from tinygrad.runtime.support.elf import elf_loader
|
||
|
from tinygrad.runtime.ops_cuda import cu_time_execution
|
||
|
|
||
|
def _hook(fxn_address_value, tramp):
  """Overwrite the first bytes of the native function at `fxn_address_value` with `tramp`.

  Temporarily makes the containing pages writable, copies the trampoline in,
  restores execute permissions, and flushes the instruction cache over the
  patched range so the CPU fetches the new code.
  """
  # base of the 4K page containing the function entry point
  page_start = fxn_address_value & ~0xfff
  # rwx over two pages, in case the trampoline straddles a page boundary
  rc = libc.mprotect(page_start, 0x2000, 7)
  assert rc == 0
  libc.memcpy(fxn_address_value, tramp, len(tramp))
  # back to r-x
  rc = libc.mprotect(page_start, 0x2000, 5)
  assert rc == 0
  # NOTE(review): assumes CPUProgram.rt_lib exposes __clear_cache; the icache flush matters on aarch64
  CPUProgram.rt_lib["__clear_cache"](fxn_address_value, fxn_address_value + len(tramp))
|
||
|
|
||
|
def install_hook(c_function, python_function):
  """Redirect calls to the native `c_function` into the ctypes callback `python_function`.

  Builds an absolute-jump trampoline for the current ISA, saves the bytes it
  will clobber, then patches the trampoline over the function's entry point.
  Returns a callable that un-patches, invokes the real implementation, and
  re-installs the trampoline (use it inside the hook to call through).
  """
  hook_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value

  # AARCH64 trampoline to ioctl
  processor = platform.processor()
  if processor == "aarch64":
    # adr x16, #0xc ; ldr x16, [x16] ; br x16 ; followed by the 8-byte target
    # 0x0000000000000000: 70 00 00 10    adr x16, #0xc
    # 0x0000000000000004: 10 02 40 F9    ldr x16, [x16]
    # 0x0000000000000008: 00 02 1F D6    br x16
    tramp_bytes = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6" + struct.pack("Q", hook_addr)
  elif processor == "x86_64":
    # 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa    movabs r11, <address>
    # 0x000000000000000a: 41 FF E3                         jmp r11
    tramp_bytes = b"\x49\xBB" + struct.pack("Q", hook_addr) + b"\x41\xFF\xE3"
  else:
    raise Exception(f"processor {processor} not supported")
  # NOTE: create_string_buffer appends a NUL, so len(tramp) is one byte past the jump
  tramp = ctypes.create_string_buffer(tramp_bytes)

  # address of the native function we are about to patch
  target_addr = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
  #print(f"** hooking function at 0x{target_addr}")

  # keep a copy of the bytes the trampoline clobbers so they can be restored
  saved_bytes = (ctypes.c_char*len(tramp))()
  libc.memcpy(saved_bytes, target_addr, len(tramp))
  _hook(target_addr, tramp)

  def original(*args):
    # swap the saved bytes back in, call the real function, then re-arm the hook
    _hook(target_addr, saved_bytes)
    ret = c_function(*args)
    _hook(target_addr, tramp)
    return ret
  return original
|
||
|
|
||
|
# name -> "call through to the real implementation" trampoline returned by
# install_hook; populated in __main__ so each Python hook can forward the call
hooked = {}

# device-pointer storage address -> byte size, recorded by the cuMemAlloc_v2 hook
allocated_memory = {}

# CUfunction handle address -> kernel name, recorded by the cuModuleGetFunction hook
function_names = {}

# addresses of CUmodules seen in cuModuleLoadData
seen_modules = set()
|
||
|
|
||
|
# catch-all hook: any CUDA entry point routed here announces itself and
# returns -1, making unexpected driver calls visible (and failing) instead of silent
@ctypes.CFUNCTYPE(ctypes.c_int)
def dummy():
  print("**** dummy function hook ****")
  return -1
|
||
|
|
||
|
# hook for cuInit: trace the call, then forward to the real driver entry point
@ctypes.CFUNCTYPE(*([cuda.cuInit.restype] + cuda.cuInit.argtypes))
def cuInit(flags):
  print("call cuInit", flags)
  return hooked["cuInit"](flags)
|
||
|
|
||
|
# hook for cuMemHostAlloc: trace the requested size, then forward to the driver
@ctypes.CFUNCTYPE(*([cuda.cuMemHostAlloc.restype] + cuda.cuMemHostAlloc.argtypes))
def cuMemHostAlloc(pp, bytesize, flags):
  print(f"cuMemHostAlloc {bytesize}")
  return hooked["cuMemHostAlloc"](pp, bytesize, flags)
|
||
|
|
||
|
# hook for cuModuleLoadData: forward the load, then trace and remember the
# module handle that the driver wrote back
@ctypes.CFUNCTYPE(*([cuda.cuModuleLoadData.restype] + cuda.cuModuleLoadData.argtypes))
def cuModuleLoadData(module, image):
  ret = hooked["cuModuleLoadData"](module, image)
  # double deref: `module` is CUmodule* written by the driver; take the address
  # of the underlying handle object as a stable identifier
  module_address = ctypes.addressof(module.contents.contents)
  print(f"cuModuleLoadData 0x{image:x} -> 0x{module_address:X}")
  seen_modules.add(module_address)

  # leftover experiments for inspecting the loaded image in-process:
  #images, sections, relocs = elf_loader(bytes(to_mv(image, 0x100000)))
  #for s in sections: print(s)

  #print('\n'.join([x for x in maps.split("\n") if 'libcuda' in x]))

  #hexdump(to_mv(image, 0x1000))
  #image, sections, relocs = elf_loader(to_mv(image))
  #print(sections)
  return ret
|
||
|
|
||
|
# hook for cuModuleGetFunction: forward the lookup and record the kernel's
# (demangled, when possible) name per function handle so cuLaunchKernel can
# report which kernel ran
@ctypes.CFUNCTYPE(*([cuda.cuModuleGetFunction.restype] + cuda.cuModuleGetFunction.argtypes))
def cuModuleGetFunction(hfunc, hmod, name):
  ret = hooked["cuModuleGetFunction"](hfunc, hmod, name)
  python_name = ctypes.string_at(name).decode()

  # demangling is best-effort: pydemangler is an optional third-party package
  # pip install git+https://github.com/wbenny/pydemangler.git
  try:
    import pydemangler
    demangled_name = pydemangler.demangle(python_name)
    if demangled_name is not None: python_name = demangled_name
  except ImportError:
    pass  # fall back to the mangled name rather than crashing the hook

  print(f"called cuModuleGetFunction 0x{ctypes.addressof(hmod.contents):X} {python_name}")
  # key by the address of the CUfunction handle the driver wrote back
  function_names[ctypes.addressof(hfunc.contents.contents)] = python_name
  return ret
|
||
|
|
||
|
# hook for cuMemAlloc_v2: forward the allocation, record it, and trace it
@ctypes.CFUNCTYPE(*([cuda.cuMemAlloc_v2.restype] + cuda.cuMemAlloc_v2.argtypes))
def cuMemAlloc_v2(dptr, bytesize):
  result = hooked["cuMemAlloc_v2"](dptr, bytesize)
  # identify the allocation by the address of the CUdeviceptr the driver filled in
  device_ptr = ctypes.addressof(dptr.contents)
  allocated_memory[device_ptr] = bytesize
  print(f"cuMemAlloc_v2 {bytesize} 0x{device_ptr:X}")
  return result
|
||
|
|
||
|
# hook for cuLaunchKernel: time the real launch, print grid/block dims and the
# kernel name, and optionally dump the kernel parameters (PARAMS=1)
@ctypes.CFUNCTYPE(*([cuda.cuLaunchKernel.restype] + cuda.cuLaunchKernel.argtypes))
def cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra):
  # run the real launch inside cu_time_execution to get its wall time
  tm = cu_time_execution(lambda:
    hooked["cuLaunchKernel"](f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra), True)

  # name was recorded by the cuModuleGetFunction hook, keyed by handle address
  name = function_names[ctypes.addressof(f.contents)]
  print(f"{tm*1e6:9.2f} us -- cuLaunchKernel <<{gridDimX:6d}, {gridDimY:5d}, {gridDimZ:5d}>>",
        f"<<{blockDimX:4d}, {blockDimY:4d}, {blockDimZ:4d}>> {sharedMemBytes} {name}")

  if extra: hexdump(to_mv(extra, 0x100))

  if getenv("PARAMS") and kernelParams:
    #print(f"params @ 0x{ctypes.addressof(kernelParams.contents):X}")
    # probe (offset, size) of each kernel parameter until the driver says stop
    params = []
    while True:
      ret = cuda.cuFuncGetParamInfo(f, len(params), ctypes.byref(paramOffset:=ctypes.c_size_t()), ctypes.byref(paramSize:=ctypes.c_size_t()))
      if ret != 0: break
      params.append((paramOffset.value, paramSize.value))
    #params_dat = to_mv(kernelParams.contents, params[-1][0] + params[-1][1])
    # kernelParams is an array of pointers, one per parameter
    params_ptr = to_mv(kernelParams, len(params)*8).cast("Q")
    #params_dat = to_mv(kernelParams.contents, params[-1][0] + params[-1][1])
    for i,(off,sz) in enumerate(params):
      hexdump(to_mv(params_ptr[i], sz))

    # older param-dumping experiments, kept for reference:
    #hexdump(params_dat)
    #for i,(off,sz) in enumerate(params):
    #  print(f"{i}: offset:{off:3d} size:{sz:3d}") # --", binascii.hexlify(dat).decode())
    #  hexdump(params_dat[off:off+sz])
    #if name == "exp2_kernel_vectorized4_kernel":
    #  ptr_0 = struct.unpack("Q", params_dat[0x10:0x18])[0]
    #  hexdump(to_mv(ptr_0, 0x80))
    #ptr_1 = struct.unpack("Q", to_mv(ptr_0, 8))[0]

    #print(f"params 0x{ctypes.addressof(kernelParams):X}")
    #hexdump(to_mv(kernelParams, 0x100))
    #print(f"data 0x{to_mv(kernelParams, 8).cast('Q')[0]:X}")
    #hexdump(to_mv(kernelParams.contents, 0x80))
    #for i,addr in enumerate(to_mv(kernelParams.contents, 0x100).cast("Q")): print(f"{i*8:3d}: {addr:X}")

  # NOTE(review): always reports CUDA_SUCCESS to the caller, discarding the
  # real launch status — appears intentional for tracing, but confirm
  return 0
|
||
|
|
||
|
if __name__ == "__main__":
  #out = cuda.CUmoduleLoadingMode()
  #print(cuda.cuModuleGetLoadingMode(ctypes.byref(out)))
  #print(out.value)

  # install hooks BEFORE importing torch / running tinygrad so every driver
  # call goes through the tracers; the returned call-through closures go in `hooked`
  hooked['cuInit'] = install_hook(cuda.cuInit, cuInit)
  hooked['cuModuleGetFunction'] = install_hook(cuda.cuModuleGetFunction, cuModuleGetFunction)
  hooked['cuLaunchKernel'] = install_hook(cuda.cuLaunchKernel, cuLaunchKernel)

  # memory stuff
  hooked['cuMemAlloc_v2'] = install_hook(cuda.cuMemAlloc_v2, cuMemAlloc_v2)
  hooked['cuMemHostAlloc'] = install_hook(cuda.cuMemHostAlloc, cuMemHostAlloc)

  # module loading + not used module loading (dummy makes any use visible and fail)
  hooked['cuModuleLoadData'] = install_hook(cuda.cuModuleLoadData, cuModuleLoadData)
  install_hook(cuda.cuModuleLoad, dummy)
  install_hook(cuda.cuModuleLoadDataEx, dummy)
  install_hook(cuda.cuModuleLoadFatBinary, dummy)

  # library stuff (doesn't seem used)
  #install_hook(cuda.cuLibraryLoadData, dummy)
  #install_hook(cuda.cuLibraryLoadFromFile, dummy)
  #install_hook(cuda.cuLibraryGetModule, dummy)

  #install_hook(cuda.cuMemAllocManaged, dummy)

  # unused
  #install_hook(cuda.cuFuncGetModule, dummy)
  #install_hook(cuda.cuModuleGetGlobal_v2, dummy)

  # hook v1
  #install_hook(cuda._libraries['libcuda.so'].cuModuleGetGlobal, dummy)
  #install_hook(cuda._libraries['libcuda.so'].cuMemAlloc, dummy)
  #install_hook(cuda._libraries['libcuda.so'].cuLinkComplete, dummy)

  #nvjitlink = ctypes.CDLL("/home/tiny/.local/lib/python3.10/site-packages/nvidia/nvjitlink/lib/libnvJitLink.so.12")
  #install_hook(nvjitlink.nvJitLinkCreate, dummy)
  #nvrtc = ctypes.CDLL("/home/tiny/.local/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.11.2")
  #nvrtc = ctypes.CDLL("/usr/local/cuda-12.4/targets/x86_64-linux/lib/libnvrtc.so.12.4.127")
  #from tinygrad.runtime.autogen import nvrtc
  #install_hook(nvrtc.nvrtcCreateProgram, dummy)
  #install_hook(nvrtc.nvJitLinkCreate, dummy)

  #import tinygrad.runtime.autogen.nvrtc as nvrtc
  #install_hook(nvrtc.nvJitLinkCreate, dummy)
  #install_hook(nvrtc.nvrtcCreateProgram, dummy)

  #hooked['cuLinkCreate'] = install_hook(cuda.cuLinkCreate, dummy)

  # TINYGRAD=1: trace a minimal tinygrad CUDA workload instead of torch
  if getenv("TINYGRAD"):
    from tinygrad import Tensor
    (Tensor.zeros(6, device="CUDA").contiguous()*2).realize()
    exit(0)

  # default workload: torch (imported after the hooks are live)
  print("importing torch...")
  import torch
  print("torch", torch.__version__, torch.__file__)

  if getenv("RESNET"):
    # RESNET=1: trace a resnet18 forward pass (COMPILE=1 for torch.compile, BS for batch size)
    import torchvision.models as models
    model = models.resnet18(pretrained=True)
    model = model.cuda()
    model.eval()

    if getenv("COMPILE"): model = torch.compile(model)

    X = torch.rand(getenv("BS", 1), 3, 288, 288, device='cuda')
    model(X)

    # second run: kernels are compiled/cached now, so the trace shows steady state
    print("\n\n\n****** second run ******\n")
    model(X)
  else:
    # small elementwise workload to exercise alloc + a few kernel launches
    a = torch.zeros(4, 4).cuda()
    b = torch.zeros(4, 4).cuda()
    print("tensor created")
    print(f"a: 0x{a.data_ptr():X}")
    print(f"b: 0x{b.data_ptr():X}")
    a += 1
    b += 2
    a = a.exp2()
    b = b.exp2()
    a += b
    #c = a @ b
    print("tensor math done", a.cpu().numpy())

  # confirm cuda library is right
  #maps = pathlib.Path("/proc/self/maps").read_text()
  #print('\n'.join([x for x in maps.split("\n") if 'cuda' in x or 'nv' in x]))
|