from __future__ import annotations import ctypes, ctypes.util, functools from tinygrad.helpers import DEBUG, getenv, from_mv, init_c_var, init_c_struct_t from tinygrad.device import Compiled, BufferSpec, LRUAllocator from tinygrad.renderer.cstyle import CUDARenderer from tinygrad.renderer.ptx import PTXRenderer from tinygrad.runtime.autogen import cuda from tinygrad.runtime.support.compiler_cuda import cuda_disassemble, pretty_ptx, CUDACompiler, PTXCompiler, PTX if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import def check(status): if status != 0: raise RuntimeError(f"CUDA Error {status}, {ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()}") # noqa: E501 def encode_args(args, vals) -> tuple[ctypes.Structure, ctypes.Array]: c_args = init_c_struct_t(tuple([(f'f{i}', cuda.CUdeviceptr_v2) for i in range(len(args))] + [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals) vargs = (ctypes.c_void_p * 5)(ctypes.c_void_p(1), ctypes.cast(ctypes.byref(c_args), ctypes.c_void_p), ctypes.c_void_p(2), ctypes.cast(ctypes.pointer(ctypes.c_size_t(ctypes.sizeof(c_args))), ctypes.c_void_p), ctypes.c_void_p(0)) return c_args, vargs def cu_time_execution(cb, enable=False) -> float|None: if not enable: return cb() evs = [init_c_var(cuda.CUevent(), lambda x: cuda.cuEventCreate(ctypes.byref(x), 0)) for _ in range(2)] cuda.cuEventRecord(evs[0], None) cb() cuda.cuEventRecord(evs[1], None) check(cuda.cuEventSynchronize(evs[1])) cuda.cuEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), evs[0], evs[1]) for ev in evs: cuda.cuEventDestroy_v2(ev) return ret.value * 1e-3 class CUDAProgram: def __init__(self, dev:CUDADevice, name:str, lib:bytes, smem:int=0): self.dev, self.name, self.lib, self.smem = dev, name, lib, smem if DEBUG >= 5: print("\n".join([f"{i+1:>3} {line}" for i, line in enumerate(pretty_ptx(lib.decode('utf-8')).split("\n"))])) if DEBUG >= 6: cuda_disassemble(lib, dev.arch) check(cuda.cuCtxSetCurrent(self.dev.context)) self.module = cuda.CUmodule() status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib) if status != 0: del self.module cuda_disassemble(lib, dev.arch) raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}") check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8"))) self.prg = prg if self.smem > 0: check(cuda.cuFuncSetAttribute(self.prg, cuda.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, self.smem)) def __del__(self): if hasattr(self, 'module'): check(cuda.cuModuleUnload(self.module)) def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False): check(cuda.cuCtxSetCurrent(self.dev.context)) if not hasattr(self, "vargs"): self.c_args, self.vargs = encode_args(args, vals) else: for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i]) for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i]) return cu_time_execution(lambda: check(cuda.cuLaunchKernel(self.prg, *global_size, *local_size, self.smem, None, None, self.vargs)), enable=wait) class CUDAAllocator(LRUAllocator): def __init__(self, device:CUDADevice): self.device = device super().__init__() def _alloc(self, size, options:BufferSpec): check(cuda.cuCtxSetCurrent(self.device.context)) if options.host: return init_c_var(ctypes.c_void_p(), lambda x: check(cuda.cuMemHostAlloc(ctypes.byref(x), size, 0x01))) return init_c_var(cuda.CUdeviceptr(), lambda x: check(cuda.cuMemAlloc_v2(ctypes.byref(x), size))) def _free(self, opaque, options:BufferSpec): if options.host: check(cuda.cuMemFreeHost(opaque)) else: check(cuda.cuMemFree_v2(opaque)) def _copyin(self, dest, src:memoryview): check(cuda.cuCtxSetCurrent(self.device.context)) host_mem = self.alloc(len(src), BufferSpec(host=True)) self.device.pending_copyin.append((host_mem, len(src), BufferSpec(host=True))) ctypes.memmove(host_mem, from_mv(src), len(src)) check(cuda.cuMemcpyHtoDAsync_v2(dest, host_mem, len(src), None)) def _copyout(self, dest:memoryview, src): CUDADevice.synchronize_system() check(cuda.cuCtxSetCurrent(self.device.context)) check(cuda.cuMemcpyDtoH_v2(from_mv(dest), src, len(dest))) def _transfer(self, dest, src, sz:int, src_dev, dest_dev): check(cuda.cuCtxSetCurrent(src_dev.context)) check(cuda.cuEventCreate(ctypes.byref(sync_event := cuda.CUevent()), 0)) check(cuda.cuMemcpyDtoDAsync_v2(dest, src, sz, None)) check(cuda.cuEventRecord(sync_event, None)) check(cuda.cuCtxSetCurrent(dest_dev.context)) check(cuda.cuStreamWaitEvent(None, sync_event, 0)) # sync the default stream on the dest dev def _offset(self, buf, size:int, offset:int): return cuda.CUdeviceptr_v2(buf.value + offset) class CUDADevice(Compiled): devices: list[CUDADevice] = [] peer_access = False def __init__(self, device:str): device_id = int(device.split(":")[1]) if ":" in device else 0 check(cuda.cuInit(0)) self.cu_device = init_c_var(cuda.CUdevice(), lambda x: check(cuda.cuDeviceGet(ctypes.byref(x), device_id))) self.context = init_c_var(cuda.CUcontext(), lambda x: check(cuda.cuCtxCreate_v2(ctypes.byref(x), 0, self.cu_device))) check(cuda.cuDeviceComputeCapability(ctypes.byref(major := ctypes.c_int()), ctypes.byref(minor := ctypes.c_int()), device_id)) for dev in CUDADevice.devices: check(cuda.cuDeviceCanAccessPeer(ctypes.byref(val := ctypes.c_int()), self.cu_device, dev.cu_device)) if val.value != 1: continue check(cuda.cuCtxSetCurrent(dev.context)) check(cuda.cuCtxEnablePeerAccess(self.context, 0)) check(cuda.cuCtxSetCurrent(self.context)) check(cuda.cuCtxEnablePeerAccess(dev.context, 0)) CUDADevice.peer_access = True self.arch = f"sm_{major.value}{minor.value}" self.pending_copyin: list[tuple[int, int, BufferSpec|None]] = [] CUDADevice.devices.append(self) from tinygrad.runtime.graph.cuda import CUDAGraph super().__init__(device, CUDAAllocator(self), PTXRenderer(self.arch) if PTX else CUDARenderer(self.arch), PTXCompiler(self.arch) if PTX else CUDACompiler(self.arch), functools.partial(CUDAProgram, self), graph=CUDAGraph) def synchronize(self): check(cuda.cuCtxSetCurrent(self.context)) check(cuda.cuCtxSynchronize()) for opaque,sz,options in self.pending_copyin: self.allocator.free(opaque, sz, options) self.pending_copyin.clear() @staticmethod def synchronize_system(): for d in CUDADevice.devices: d.synchronize()