You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
242 lines
9.2 KiB
242 lines
9.2 KiB
7 hours ago
|
import ctypes, struct, platform, pathlib, os, binascii
|
||
|
from hexdump import hexdump
|
||
|
from tinygrad.helpers import to_mv, DEBUG, getenv
|
||
|
from tinygrad.runtime.autogen import libc, cuda
|
||
|
from tinygrad.device import CPUProgram
|
||
|
from tinygrad.runtime.support.elf import elf_loader
|
||
|
from tinygrad.runtime.ops_cuda import cu_time_execution
|
||
|
|
||
|
def _hook(fxn_address_value, tramp):
  """Overwrite the first bytes of the native function at `fxn_address_value` with `tramp`.

  Temporarily makes the containing pages writable, copies the trampoline in,
  restores execute permissions, and flushes the instruction cache over the
  patched range so the CPU fetches the new code.
  """
  # base of the 4K page containing the function entry point
  page_start = fxn_address_value & ~0xfff
  # rwx over two pages, in case the trampoline straddles a page boundary
  rc = libc.mprotect(page_start, 0x2000, 7)
  assert rc == 0
  libc.memcpy(fxn_address_value, tramp, len(tramp))
  # back to r-x
  rc = libc.mprotect(page_start, 0x2000, 5)
  assert rc == 0
  # NOTE(review): assumes CPUProgram.rt_lib exposes __clear_cache; the icache flush matters on aarch64
  CPUProgram.rt_lib["__clear_cache"](fxn_address_value, fxn_address_value + len(tramp))
|
||
|
|
||
|
def install_hook(c_function, python_function):
  """Redirect calls to the native `c_function` into the ctypes callback `python_function`.

  Builds an absolute-jump trampoline for the current ISA, saves the bytes it
  will clobber, then patches the trampoline over the function's entry point.
  Returns a callable that un-patches, invokes the real implementation, and
  re-installs the trampoline (use it inside the hook to call through).
  """
  hook_addr = ctypes.cast(ctypes.byref(python_function), ctypes.POINTER(ctypes.c_ulong)).contents.value

  # AARCH64 trampoline to ioctl
  processor = platform.processor()
  if processor == "aarch64":
    # adr x16, #0xc ; ldr x16, [x16] ; br x16 ; followed by the 8-byte target
    # 0x0000000000000000: 70 00 00 10    adr x16, #0xc
    # 0x0000000000000004: 10 02 40 F9    ldr x16, [x16]
    # 0x0000000000000008: 00 02 1F D6    br x16
    tramp_bytes = b"\x70\x00\x00\x10\x10\x02\x40\xf9\x00\x02\x1f\xd6" + struct.pack("Q", hook_addr)
  elif processor == "x86_64":
    # 0x0000000000000000: 49 BB aa aa aa aa aa aa aa aa    movabs r11, <address>
    # 0x000000000000000a: 41 FF E3                         jmp r11
    tramp_bytes = b"\x49\xBB" + struct.pack("Q", hook_addr) + b"\x41\xFF\xE3"
  else:
    raise Exception(f"processor {processor} not supported")
  # NOTE: create_string_buffer appends a NUL, so len(tramp) is one byte past the jump
  tramp = ctypes.create_string_buffer(tramp_bytes)

  # address of the native function we are about to patch
  target_addr = ctypes.cast(ctypes.byref(c_function), ctypes.POINTER(ctypes.c_ulong)).contents.value
  #print(f"** hooking function at 0x{target_addr}")

  # keep a copy of the bytes the trampoline clobbers so they can be restored
  saved_bytes = (ctypes.c_char*len(tramp))()
  libc.memcpy(saved_bytes, target_addr, len(tramp))
  _hook(target_addr, tramp)

  def original(*args):
    # swap the saved bytes back in, call the real function, then re-arm the hook
    _hook(target_addr, saved_bytes)
    ret = c_function(*args)
    _hook(target_addr, tramp)
    return ret
  return original
|
||
|
|
||
|
# name -> "call through to the real implementation" trampoline returned by
# install_hook; populated in __main__ so each Python hook can forward the call
hooked = {}

# device-pointer storage address -> byte size, recorded by the cuMemAlloc_v2 hook
allocated_memory = {}

# CUfunction handle address -> kernel name, recorded by the cuModuleGetFunction hook
function_names = {}

# addresses of CUmodules seen in cuModuleLoadData
seen_modules = set()
|
||
|
|
||
|
# catch-all hook: any CUDA entry point routed here announces itself and
# returns -1, making unexpected driver calls visible (and failing) instead of silent
@ctypes.CFUNCTYPE(ctypes.c_int)
def dummy():
  print("**** dummy function hook ****")
  return -1
|
||
|
|
||
|
# hook for cuInit: trace the call, then forward to the real driver entry point
@ctypes.CFUNCTYPE(*([cuda.cuInit.restype] + cuda.cuInit.argtypes))
def cuInit(flags):
  print("call cuInit", flags)
  return hooked["cuInit"](flags)
|
||
|
|
||
|
# hook for cuMemHostAlloc: trace the requested size, then forward to the driver
@ctypes.CFUNCTYPE(*([cuda.cuMemHostAlloc.restype] + cuda.cuMemHostAlloc.argtypes))
def cuMemHostAlloc(pp, bytesize, flags):
  print(f"cuMemHostAlloc {bytesize}")
  return hooked["cuMemHostAlloc"](pp, bytesize, flags)
|
||
|
|
||
|
# hook for cuModuleLoadData: forward the load, then trace and remember the
# module handle that the driver wrote back
@ctypes.CFUNCTYPE(*([cuda.cuModuleLoadData.restype] + cuda.cuModuleLoadData.argtypes))
def cuModuleLoadData(module, image):
  ret = hooked["cuModuleLoadData"](module, image)
  # double deref: `module` is CUmodule* written by the driver; take the address
  # of the underlying handle object as a stable identifier
  module_address = ctypes.addressof(module.contents.contents)
  print(f"cuModuleLoadData 0x{image:x} -> 0x{module_address:X}")
  seen_modules.add(module_address)

  # leftover experiments for inspecting the loaded image in-process:
  #images, sections, relocs = elf_loader(bytes(to_mv(image, 0x100000)))
  #for s in sections: print(s)

  #print('\n'.join([x for x in maps.split("\n") if 'libcuda' in x]))

  #hexdump(to_mv(image, 0x1000))
  #image, sections, relocs = elf_loader(to_mv(image))
  #print(sections)
  return ret
|
||
|
|
||
|
# hook for cuModuleGetFunction: forward the lookup and record the kernel's
# (demangled, when possible) name per function handle so cuLaunchKernel can
# report which kernel ran
@ctypes.CFUNCTYPE(*([cuda.cuModuleGetFunction.restype] + cuda.cuModuleGetFunction.argtypes))
def cuModuleGetFunction(hfunc, hmod, name):
  ret = hooked["cuModuleGetFunction"](hfunc, hmod, name)
  python_name = ctypes.string_at(name).decode()

  # demangling is best-effort: pydemangler is an optional third-party package
  # pip install git+https://github.com/wbenny/pydemangler.git
  try:
    import pydemangler
    demangled_name = pydemangler.demangle(python_name)
    if demangled_name is not None: python_name = demangled_name
  except ImportError:
    pass  # fall back to the mangled name rather than crashing the hook

  print(f"called cuModuleGetFunction 0x{ctypes.addressof(hmod.contents):X} {python_name}")
  # key by the address of the CUfunction handle the driver wrote back
  function_names[ctypes.addressof(hfunc.contents.contents)] = python_name
  return ret
|
||
|
|
||
|
# hook for cuMemAlloc_v2: forward the allocation, record it, and trace it
@ctypes.CFUNCTYPE(*([cuda.cuMemAlloc_v2.restype] + cuda.cuMemAlloc_v2.argtypes))
def cuMemAlloc_v2(dptr, bytesize):
  result = hooked["cuMemAlloc_v2"](dptr, bytesize)
  # identify the allocation by the address of the CUdeviceptr the driver filled in
  device_ptr = ctypes.addressof(dptr.contents)
  allocated_memory[device_ptr] = bytesize
  print(f"cuMemAlloc_v2 {bytesize} 0x{device_ptr:X}")
  return result
|
||
|
|
||
|
# hook for cuLaunchKernel: time the real launch, print grid/block dims and the
# kernel name, and optionally dump the kernel parameters (PARAMS=1)
@ctypes.CFUNCTYPE(*([cuda.cuLaunchKernel.restype] + cuda.cuLaunchKernel.argtypes))
def cuLaunchKernel(f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra):
  # run the real launch inside cu_time_execution to get its wall time
  tm = cu_time_execution(lambda:
    hooked["cuLaunchKernel"](f, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemBytes, hStream, kernelParams, extra), True)

  # name was recorded by the cuModuleGetFunction hook, keyed by handle address
  name = function_names[ctypes.addressof(f.contents)]
  print(f"{tm*1e6:9.2f} us -- cuLaunchKernel <<{gridDimX:6d}, {gridDimY:5d}, {gridDimZ:5d}>>",
        f"<<{blockDimX:4d}, {blockDimY:4d}, {blockDimZ:4d}>> {sharedMemBytes} {name}")

  if extra: hexdump(to_mv(extra, 0x100))

  if getenv("PARAMS") and kernelParams:
    #print(f"params @ 0x{ctypes.addressof(kernelParams.contents):X}")
    # probe (offset, size) of each kernel parameter until the driver says stop
    params = []
    while True:
      ret = cuda.cuFuncGetParamInfo(f, len(params), ctypes.byref(paramOffset:=ctypes.c_size_t()), ctypes.byref(paramSize:=ctypes.c_size_t()))
      if ret != 0: break
      params.append((paramOffset.value, paramSize.value))
    #params_dat = to_mv(kernelParams.contents, params[-1][0] + params[-1][1])
    # kernelParams is an array of pointers, one per parameter
    params_ptr = to_mv(kernelParams, len(params)*8).cast("Q")
    #params_dat = to_mv(kernelParams.contents, params[-1][0] + params[-1][1])
    for i,(off,sz) in enumerate(params):
      hexdump(to_mv(params_ptr[i], sz))

    # older param-dumping experiments, kept for reference:
    #hexdump(params_dat)
    #for i,(off,sz) in enumerate(params):
    #  print(f"{i}: offset:{off:3d} size:{sz:3d}") # --", binascii.hexlify(dat).decode())
    #  hexdump(params_dat[off:off+sz])
    #if name == "exp2_kernel_vectorized4_kernel":
    #  ptr_0 = struct.unpack("Q", params_dat[0x10:0x18])[0]
    #  hexdump(to_mv(ptr_0, 0x80))
    #ptr_1 = struct.unpack("Q", to_mv(ptr_0, 8))[0]

    #print(f"params 0x{ctypes.addressof(kernelParams):X}")
    #hexdump(to_mv(kernelParams, 0x100))
    #print(f"data 0x{to_mv(kernelParams, 8).cast('Q')[0]:X}")
    #hexdump(to_mv(kernelParams.contents, 0x80))
    #for i,addr in enumerate(to_mv(kernelParams.contents, 0x100).cast("Q")): print(f"{i*8:3d}: {addr:X}")

  # NOTE(review): always reports CUDA_SUCCESS to the caller, discarding the
  # real launch status — appears intentional for tracing, but confirm
  return 0
|
||
|
|
||
|
if __name__ == "__main__":
  #out = cuda.CUmoduleLoadingMode()
  #print(cuda.cuModuleGetLoadingMode(ctypes.byref(out)))
  #print(out.value)

  # install hooks BEFORE importing torch / running tinygrad so every driver
  # call goes through the tracers; the returned call-through closures go in `hooked`
  hooked['cuInit'] = install_hook(cuda.cuInit, cuInit)
  hooked['cuModuleGetFunction'] = install_hook(cuda.cuModuleGetFunction, cuModuleGetFunction)
  hooked['cuLaunchKernel'] = install_hook(cuda.cuLaunchKernel, cuLaunchKernel)

  # memory stuff
  hooked['cuMemAlloc_v2'] = install_hook(cuda.cuMemAlloc_v2, cuMemAlloc_v2)
  hooked['cuMemHostAlloc'] = install_hook(cuda.cuMemHostAlloc, cuMemHostAlloc)

  # module loading + not used module loading (dummy makes any use visible and fail)
  hooked['cuModuleLoadData'] = install_hook(cuda.cuModuleLoadData, cuModuleLoadData)
  install_hook(cuda.cuModuleLoad, dummy)
  install_hook(cuda.cuModuleLoadDataEx, dummy)
  install_hook(cuda.cuModuleLoadFatBinary, dummy)

  # library stuff (doesn't seem used)
  #install_hook(cuda.cuLibraryLoadData, dummy)
  #install_hook(cuda.cuLibraryLoadFromFile, dummy)
  #install_hook(cuda.cuLibraryGetModule, dummy)

  #install_hook(cuda.cuMemAllocManaged, dummy)

  # unused
  #install_hook(cuda.cuFuncGetModule, dummy)
  #install_hook(cuda.cuModuleGetGlobal_v2, dummy)

  # hook v1
  #install_hook(cuda._libraries['libcuda.so'].cuModuleGetGlobal, dummy)
  #install_hook(cuda._libraries['libcuda.so'].cuMemAlloc, dummy)
  #install_hook(cuda._libraries['libcuda.so'].cuLinkComplete, dummy)

  #nvjitlink = ctypes.CDLL("/home/tiny/.local/lib/python3.10/site-packages/nvidia/nvjitlink/lib/libnvJitLink.so.12")
  #install_hook(nvjitlink.nvJitLinkCreate, dummy)
  #nvrtc = ctypes.CDLL("/home/tiny/.local/lib/python3.10/site-packages/nvidia/cuda_nvrtc/lib/libnvrtc.so.11.2")
  #nvrtc = ctypes.CDLL("/usr/local/cuda-12.4/targets/x86_64-linux/lib/libnvrtc.so.12.4.127")
  #from tinygrad.runtime.autogen import nvrtc
  #install_hook(nvrtc.nvrtcCreateProgram, dummy)
  #install_hook(nvrtc.nvJitLinkCreate, dummy)

  #import tinygrad.runtime.autogen.nvrtc as nvrtc
  #install_hook(nvrtc.nvJitLinkCreate, dummy)
  #install_hook(nvrtc.nvrtcCreateProgram, dummy)

  #hooked['cuLinkCreate'] = install_hook(cuda.cuLinkCreate, dummy)

  # TINYGRAD=1: trace a minimal tinygrad CUDA workload instead of torch
  if getenv("TINYGRAD"):
    from tinygrad import Tensor
    (Tensor.zeros(6, device="CUDA").contiguous()*2).realize()
    exit(0)

  # default workload: torch (imported after the hooks are live)
  print("importing torch...")
  import torch
  print("torch", torch.__version__, torch.__file__)

  if getenv("RESNET"):
    # RESNET=1: trace a resnet18 forward pass (COMPILE=1 for torch.compile, BS for batch size)
    import torchvision.models as models
    model = models.resnet18(pretrained=True)
    model = model.cuda()
    model.eval()

    if getenv("COMPILE"): model = torch.compile(model)

    X = torch.rand(getenv("BS", 1), 3, 288, 288, device='cuda')
    model(X)

    # second run: kernels are compiled/cached now, so the trace shows steady state
    print("\n\n\n****** second run ******\n")
    model(X)
  else:
    # small elementwise workload to exercise alloc + a few kernel launches
    a = torch.zeros(4, 4).cuda()
    b = torch.zeros(4, 4).cuda()
    print("tensor created")
    print(f"a: 0x{a.data_ptr():X}")
    print(f"b: 0x{b.data_ptr():X}")
    a += 1
    b += 2
    a = a.exp2()
    b = b.exp2()
    a += b
    #c = a @ b
    print("tensor math done", a.cpu().numpy())

  # confirm cuda library is right
  #maps = pathlib.Path("/proc/self/maps").read_text()
  #print('\n'.join([x for x in maps.split("\n") if 'cuda' in x or 'nv' in x]))
|