import pathlib, re, ctypes, ctypes.util, mmap, collections, functools, copy, os
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.am.am as am
from tinygrad.helpers import from_mv
from test.mockgpu.driver import VirtDriver, VirtFileDesc, TextFileDesc, DirFileDesc, VirtFile
from test.mockgpu.amd.amdgpu import AMDGPU, gpu_props

# `ctypes.util` is a submodule and must be imported explicitly (see imports above);
# `import ctypes` alone does not guarantee `ctypes.util.find_library` is available.
libc = ctypes.CDLL(ctypes.util.find_library("c"))
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p

def ioctls_from_header():
  """Parse the AMDKFD ioctl definitions out of `extra/hip_gpu_driver/kfd_ioctl.h`.

  Returns a 2-tuple:
    * a class named `KFD_IOCTLS` whose attributes map each `AMDKFD_IOC_*` name to its ioctl number, and
    * a dict mapping each ioctl number to the matching `struct_<name>` attribute of the `kfd` autogen module.

  Raises whatever `Path.read_text` raises if the header is missing, and AttributeError if `kfd` lacks a
  struct named in the header.
  """
  # Disabled alternative that parsed the autogenerated kfd.py instead of the C header:
  # hdrpy = (pathlib.Path(__file__).parent.parent.parent.parent / "tinygrad" / "runtime" / "autogen" / "kfd.py").read_text()
  # pattern = r'# (AMDKFD_IOC_[A-Z0-9_]+)\s=\s_(IOW?R?).*\(( 0x[0-9a-fA-F]+) ,\s+struct\s([A-Za-z0-9_]+)\s+\)'
  # matches = re.findall(pattern, hdrpy, re.MULTILINE)

  # Backslash-newline continuations are joined so every #define sits on a single line for the regex.
  hdr = (pathlib.Path(__file__).parent.parent.parent.parent / "extra" / "hip_gpu_driver" / "kfd_ioctl.h").read_text().replace("\\\n", "")
  pattern = r'#define\s+(AMDKFD_IOC_[A-Z0-9_]+)\s+AMDKFD_(IOW?R?)\((0x[0-9a-fA-F]+),\s+struct\s([A-Za-z0-9_]+)\)'
  matches = re.findall(pattern, hdr, re.MULTILINE)  # tuples of (name, direction, hex number, struct name)

  ioctl_numbers = {name: int(nr, 0x10) for name, _, nr, _ in matches}
  ioctl_structs = {int(nr, 0x10): getattr(kfd, "struct_"+sname) for _, _, nr, sname in matches}
  return type("KFD_IOCTLS", (object, ), ioctl_numbers), ioctl_structs

kfd_ioctls, kfd_headers = ioctls_from_header()

class KFDFileDesc(VirtFileDesc):
  """File descriptor for `/dev/kfd`; forwards ioctls to the owning driver."""
  def __init__(self, fd, driver):
    super().__init__(fd)
    self.driver = driver

  def ioctl(self, fd, request, argp):
    """Delegate to `driver.kfd_ioctl`; the `fd` argument is not used."""
    return self.driver.kfd_ioctl(request, argp)

  def mmap(self, start, sz, prot, flags, fd, offset):
    """Return `offset` unchanged; no mapping is created here."""
    return offset

class DRMFileDesc(VirtFileDesc):
  """File descriptor for `/dev/dri/renderD*`; stores the driver and gpu it was created with."""
  def __init__(self, fd, driver, gpu):
    super().__init__(fd)
    self.driver, self.gpu = driver, gpu

  def mmap(self, start, sz, prot, flags, fd, offset):
    """Create an anonymous mapping through libc; the caller's `fd` and `offset` are replaced by -1 and 0."""
    return libc.mmap(start, sz, prot, flags|mmap.MAP_ANONYMOUS, -1, 0)

class AMDDriver(VirtDriver):
  """Virtual driver that registers KFD/DRM device and sysfs paths and services KFD ioctls for `gpus` AMDGPU objects.

  State kept on the instance:
    gpus             -- gpu_id -> AMDGPU
    object_by_handle -- handle -> deep copy of the alloc-memory ioctl struct that created it
    doorbells        -- gpu_id -> memoryview over a 0x2000-byte buffer used as doorbell storage
    next_doorbell    -- gpu_id -> index of the next unused 8-byte doorbell slot
    mmu_event_ids    -- ids of events created with type KFD_IOC_EVENT_MEMORY
  """
  def __init__(self, gpus=6):
    super().__init__()

    self.tracked_files += [VirtFile('/dev/kfd', functools.partial(KFDFileDesc, driver=self))] + \
      [VirtFile('/sys/devices/virtual/kfd/kfd/topology/nodes', functools.partial(DirFileDesc, child_names=[str(i) for i in range(gpus)]))]

    self.gpus = {}
    self.next_fd = (1 << 30)
    self.next_handle = 1
    self.next_event = 1

    self.object_by_handle = {}
    self.doorbells = {}
    self.next_doorbell = collections.defaultdict(int)
    self.mmu_event_ids = []

    for i in range(gpus): self._prepare_gpu(i)

  def _alloc_fd(self):
    """Return the next unused fd number (counting up from 1 << 30)."""
    my_fd = self.next_fd
    self.next_fd = self.next_fd + 1
    return my_fd

  def _alloc_handle(self):
    """Return the next unused memory handle (counting up from 1)."""
    handle = self.next_handle
    self.next_handle += 1
    return handle

  def _alloc_next_event_slot(self):
    """Return the next unused event slot index (counting up from 1)."""
    ev = self.next_event
    self.next_event += 1
    return ev

  def _alloc_doorbell(self, gpu_id):
    """Return the address of the next free 8-byte doorbell slot in `gpu_id`'s doorbell buffer.

    Raises RuntimeError once every slot in the buffer has been handed out, instead of returning an
    address that lies past the end of the buffer.
    """
    region = self.doorbells[gpu_id]
    offset = self.next_doorbell[gpu_id] * 8
    if offset + 8 > region.nbytes: raise RuntimeError(f"out of doorbells for gpu {gpu_id}")
    self.next_doorbell[gpu_id] += 1
    return ctypes.addressof(from_mv(region)) + offset

  def _prepare_gpu(self, gpu_id):
    """Create the AMDGPU object and doorbell buffer for `gpu_id` and register its sysfs/device files."""
    self.doorbells[gpu_id] = memoryview(bytearray(0x2000))
    self.gpus[gpu_id] = AMDGPU(gpu_id)

    def text_file(path, text): return VirtFile(path, functools.partial(TextFileDesc, text=text))

    node_dir = f'/sys/devices/virtual/kfd/kfd/topology/nodes/{gpu_id}'
    die_dir = f'/sys/class/drm/renderD{gpu_id}/device/ip_discovery/die/0'
    # (hardware id, major, minor, revision, base_addr) reported under ip_discovery for each IP block
    ip_blocks = [
      (am.GC_HWID, '11', '0', '0', '0x00001260\n0x0000A000\n0x0001C000\n0x02402C00'),
      (am.SDMA0_HWID, '6', '0', '0', '0x00001260\n0x0000A000\n0x0001C000\n0x02402C00'),
      (am.NBIF_HWID, '4', '3', '0', '0x00000000\n0x00000014\n0x00000D20\n0x00010400\n0x0241B000\n0x04040000'),
    ]

    # NOTE(review): '/sys/module/amdgpu' is registered once per GPU, so tracked_files holds duplicates of it;
    # presumably harmless -- confirm against VirtDriver's lookup.
    files = [
      text_file('/sys/module/amdgpu', "1"),
      VirtFile(node_dir, functools.partial(DirFileDesc, child_names=['gpu_id', 'properties'])),
      text_file(f'{node_dir}/gpu_id', f"{gpu_id}"),
      text_file(f'{node_dir}/properties', gpu_props.format(drm_render_minor=gpu_id)),
      VirtFile(die_dir, functools.partial(DirFileDesc, child_names=[str(hwid) for hwid, *_ in ip_blocks])),
    ]
    for hwid, major, minor, revision, base_addr in ip_blocks:
      files += [
        text_file(f'{die_dir}/{hwid}/0/major', major),
        text_file(f'{die_dir}/{hwid}/0/minor', minor),
        text_file(f'{die_dir}/{hwid}/0/revision', revision),
        text_file(f'{die_dir}/{hwid}/0/base_addr', base_addr),
      ]
    # NOTE(review): `gpu` receives the string form of the AMDGPU object, not the object itself; DRMFileDesc
    # only stores it, so this is kept as-is -- confirm whether the object was intended.
    files.append(VirtFile(f'/dev/dri/renderD{gpu_id}', functools.partial(DRMFileDesc, driver=self, gpu=f"{self.gpus[gpu_id]}")))

    self.tracked_files += files

  def open(self, name, flags, mode, virtfile):
    """Instantiate `virtfile.fdcls` with a freshly allocated fd; `name`, `flags` and `mode` are unused."""
    return virtfile.fdcls(self._alloc_fd())

  def _for_each_target_gpu(self, struct, method_name):
    """Call `gpu.<method_name>(va_addr, size)` on every device listed in a map/unmap ioctl struct.

    `struct.n_success` is updated after each device, so it reflects how many were processed if a
    lookup raises part-way through.
    """
    dev_ids = (ctypes.c_int32 * struct.n_devices).from_address(struct.device_ids_array_ptr)
    for i in range(struct.n_devices):
      gpu = self.gpus[dev_ids[i]]
      mem_obj = self.object_by_handle[struct.handle]
      getattr(gpu, method_name)(mem_obj.va_addr, mem_obj.size)
      struct.n_success = i + 1

  def kfd_ioctl(self, req, argp):
    """Handle one KFD ioctl.

    req  -- ioctl request; only the low 8 bits (the ioctl number) are used.
    argp -- address of the ioctl argument struct, read and written in place.

    Returns 0 on success, or -1 when ALLOC_MEMORY_OF_GPU names an unknown gpu_id.
    Raises KeyError for an ioctl number missing from the parsed header (or unknown gpu/handle ids),
    RuntimeError for an unsupported queue type, and AssertionError for an ioctl that is in the
    header but not implemented here.
    """
    nr = req & 0xFF
    struct = kfd_headers[nr].from_address(argp)

    if nr == kfd_ioctls.AMDKFD_IOC_ACQUIRE_VM: pass
    elif nr == kfd_ioctls.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU:
      if struct.gpu_id not in self.gpus: return -1
      struct.handle = self._alloc_handle()
      self.object_by_handle[struct.handle] = copy.deepcopy(struct) # save memory struct to know what mem it is
    elif nr == kfd_ioctls.AMDKFD_IOC_FREE_MEMORY_OF_GPU:
      self.object_by_handle.pop(struct.handle)
    elif nr == kfd_ioctls.AMDKFD_IOC_MAP_MEMORY_TO_GPU:
      self._for_each_target_gpu(struct, "map_range")
    elif nr == kfd_ioctls.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU:
      self._for_each_target_gpu(struct, "unmap_range")
    elif nr == kfd_ioctls.AMDKFD_IOC_CREATE_EVENT:
      struct.event_slot_index = self._alloc_next_event_slot()
      struct.event_id = struct.event_slot_index
      # remember memory events so WAIT_EVENTS can attach an emulated fault to them
      if struct.event_type == kfd.KFD_IOC_EVENT_MEMORY: self.mmu_event_ids.append(struct.event_id)
    elif nr == kfd_ioctls.AMDKFD_IOC_CREATE_QUEUE:
      gpu = self.gpus[struct.gpu_id]
      if struct.queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
        gpu.add_sdma_queue(struct.ring_base_address, struct.ring_size, struct.read_pointer_address, struct.write_pointer_address)
      elif struct.queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE:
        gpu.add_pm4_queue(struct.ring_base_address, struct.ring_size, struct.read_pointer_address, struct.write_pointer_address)
      else:
        raise RuntimeError("Unsuported, queue")

      # Track writes to doorbell, calling callback
      struct.doorbell_offset = self._alloc_doorbell(struct.gpu_id)
      self.track_address(struct.doorbell_offset, struct.doorbell_offset + 8, lambda mv,off: None, lambda mv, off: self._emulate_execute())
    elif nr == kfd_ioctls.AMDKFD_IOC_WAIT_EVENTS:
      evs = (kfd.struct_kfd_event_data * struct.num_events).from_address(struct.events_ptr)
      fault_addr = os.environ.get("MOCKGPU_EMU_FAULTADDR")
      for ev in evs:
        # when MOCKGPU_EMU_FAULTADDR is set, fill memory events with a not-present fault at that (hex) address
        if fault_addr is not None and ev.event_id in self.mmu_event_ids:
          ev.memory_exception_data.gpu_id = 1
          ev.memory_exception_data.va = int(fault_addr, 16)
          ev.memory_exception_data.failure.NotPresent = 1
    else:
      name = "unknown"
      for k,v in kfd_ioctls.__dict__.items():
        if nr == v: name = k
      # raised explicitly (not via `assert`) so the failure is not stripped under `python -O`
      raise AssertionError(f"unknown kfd ioctl, {nr} {name}")
    return 0

  def _emulate_execute(self):
    """Run every queue whose read pointer differs from its write pointer until no queue advances."""
    any_progress = True
    while any_progress:
      any_progress = False
      for gpu in self.gpus.values():
        for q in gpu.queues:
          if (prev_rptr:=q.rptr[0]) != q.wptr[0]:
            q.execute()
            # only count it as progress if execute() actually moved the read pointer
            any_progress |= (prev_rptr != q.rptr[0])