You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
104 lines
6.6 KiB
104 lines
6.6 KiB
from __future__ import annotations
|
|
import platform, subprocess, sys, ctypes, functools, time
|
|
from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, round_up, wait_cond
|
|
from tinygrad.device import Compiler, BufferSpec, DMACPURef
|
|
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
|
|
from tinygrad.runtime.support.elf import jit_loader
|
|
from tinygrad.renderer.cstyle import ClangRenderer
|
|
from tinygrad.uop.ops import sint
|
|
|
|
class ClangJITCompiler(Compiler):
  """Compiler that builds C source with clang (run as a subprocess) and passes the object to `jit_loader`."""
  def __init__(self, cachekey="compile_clang_jit"):
    super().__init__(cachekey)

  def compile(self, src:str) -> bytes:
    """Compile `src` and return whatever `jit_loader` produces for the resulting object file.

    The compiler binary is taken from the CC environment variable (default 'clang'). The source is piped in on stdin
    and the object is read back from stdout. `subprocess.check_output` raises CalledProcessError on a non-zero exit.
    """
    # windows always targets x86_64, everywhere else the target follows platform.machine()
    arch = platform.machine() if sys.platform != 'win32' else 'x86_64'
    # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
    flags = ['-march=native', f'--target={arch}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
    # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer
    # in windows on arm, so keep the compiler away from it
    if arch == 'arm64': flags.append('-ffixed-x18')
    cmdline = [getenv("CC", 'clang'), '-c', '-x', 'c', *flags, '-', '-o', '-']
    return jit_loader(subprocess.check_output(cmdline, input=src.encode('utf-8')))

  def disassemble(self, lib:bytes):
    return capstone_flatdump(lib)
|
|
|
|
class CPUComputeQueue(HWQueue):
  """Queue whose commands are plain Python callables executed on the host when the queue is submitted.

  Every command is stored as a flat record `fn, argc, *args` (see `cmd`); `_submit` walks those records in order.
  """
  # *** command implementations, invoked from _submit ***

  def _exec(self, prg, bufs, *args):
    # the first `bufs` args are buffer addresses (passed as uint64), the remaining ones are integer values
    val_t = ctypes.c_int64 if platform.machine() == "arm64" else ctypes.c_int32
    prg.fxn(*[ctypes.c_uint64(addr) for addr in args[:bufs]], *[val_t(val) for val in args[bufs:]])

  def _signal(self, signal_addr, value):
    # store `value` into the 32-bit unsigned word at signal_addr
    to_mv(signal_addr, 4).cast('I')[0] = value

  def _wait(self, signal_addr, value):
    # poll the 32-bit unsigned word at signal_addr until it is >= value
    def reached(): return to_mv(signal_addr, 4).cast('I')[0] >= value
    wait_cond(reached, timeout_ms=60000)

  def _timestamp(self, timestamp_addr):
    # store time.perf_counter_ns() into the 64-bit unsigned word at timestamp_addr
    to_mv(timestamp_addr, 8).cast('Q')[0] = time.perf_counter_ns()

  # *** queue building ***

  def cmd(self, cmd, *args):
    """Append one `cmd, len(args), *args` record to the queue and return self for chaining."""
    self.q(cmd, len(args), *args)
    return self

  def memory_barrier(self):
    return self

  def exec(self, prg:CPUProgram, args_state:HCQArgsState, global_size, local_size):
    # global_size and local_size are accepted but not used here
    buf_addrs = [b.va_addr for b in args_state.bufs]
    return self.cmd(self._exec, prg, len(args_state.bufs), *buf_addrs, *args_state.vals)

  def wait(self, signal, value=0):
    return self.cmd(self._wait, signal.value_addr, value)

  def timestamp(self, signal):
    return self.cmd(self._timestamp, signal.timestamp_addr)

  def signal(self, signal, value:sint=0):
    return self.cmd(self._signal, signal.value_addr, value)

  def _submit(self, dev):
    # Execute the commands in the queue: fn, argc, args...
    pos = 0
    while pos < len(self._q):
      fn, argc = self._q[pos], self._q[pos + 1]
      fn(*self._q[pos + 2:pos + 2 + argc])
      pos += argc + 2
|
|
|
|
# NOTE: the mmap module only gained MAP_JIT in python 3.13, so the flag value is spelled out here
MAP_JIT: int = 0x0800
|
|
|
|
class CPUProgram(HCQProgram):
  """A compiled kernel copied into executable memory and exposed as a ctypes callable in `self.fxn`.

  On windows the memory comes from VirtualAlloc and is released in `__del__`; elsewhere it is an anonymous mmap.
  """
  # Runtime support library: 'System' on macos, 'kernel32' on windows, libgcc_s.so.1 everywhere else.
  # NOTE(review): this uses ctypes.util while the file only does `import ctypes` -- presumably another import has
  # already loaded the ctypes.util submodule; confirm.
  rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')

  def __init__(self, dev, name:str, lib:bytes):
    """Copy `lib` into freshly allocated executable memory and build `self.fxn` pointing at its start.

    Raises MemoryError on windows if VirtualAlloc cannot provide the memory.
    """
    if sys.platform == "win32":
      PAGE_EXECUTE_READWRITE, MEM_COMMIT, MEM_RESERVE = 0x40, 0x1000, 0x2000
      ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
      self.mem = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_void_p(0), ctypes.c_size_t(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
      # VirtualAlloc returns NULL on failure (None with a c_void_p restype). Fail loudly instead of memmove-ing to NULL.
      if not self.mem: raise MemoryError(f"VirtualAlloc failed to allocate {len(lib)} bytes of executable memory for {name}")
      ctypes.memmove(self.mem, lib, len(lib))
      ctypes.windll.kernel32.GetCurrentProcess.restype = ctypes.c_void_p
      proc = ctypes.windll.kernel32.GetCurrentProcess()
      ctypes.windll.kernel32.FlushInstructionCache(ctypes.c_void_p(proc), ctypes.c_void_p(self.mem), ctypes.c_size_t(len(lib)))
      self.fxn = ctypes.CFUNCTYPE(None)(self.mem)
    else:
      from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
      # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
      # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
      self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

      if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
      try: self.mem.write(lib)
      finally:
        # always restore write protection, even if the copy raised, so the flag does not stay flipped
        if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)

      # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
      # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
      # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
      # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
      CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

      self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))

    super().__init__(HCQArgsState, dev, name, kernargs_alloc_size=0)

  def __del__(self):
    # __del__ also runs when __init__ raised early, so self.mem may be missing or None; only free a real allocation
    if sys.platform == 'win32' and getattr(self, 'mem', None) is not None:
      ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE
|
|
|
|
class CPUAllocator(HCQAllocatorBase):
  """Allocator backing buffers with host memory held in ctypes arrays."""
  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
    """Return an HCQBuffer of `size` bytes, wrapping `options.external_ptr` if set, else a fresh 0x1000-aligned array."""
    arr_t = ctypes.c_uint8 * size
    if options.external_ptr:
      buf = arr_t.from_address(options.external_ptr)
    else:
      # over-allocate by 0x1000 bytes and start the buffer at the first 0x1000-aligned address inside the backing array
      backing = (ctypes.c_uint8 * (size + 0x1000))()
      aligned = round_up(ctypes.addressof(backing), 0x1000)
      buf = arr_t.from_buffer(backing, aligned - ctypes.addressof(backing))
    va, sz = ctypes.addressof(buf), ctypes.sizeof(buf)
    return HCQBuffer(va, sz, meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev)

  def _as_buffer(self, src) -> memoryview:
    return to_mv(src.va_addr, src.size)

  def _as_dmaref(self, buf):
    return DMACPURef(buf.va_addr, buf.size)

  def _copyin(self, dest, src:memoryview):
    # copy len(src) bytes from the memoryview into the buffer
    ctypes.memmove(dest.va_addr, from_mv(src), len(src))

  def _copyout(self, dest:memoryview, src):
    # copy len(dest) bytes from the buffer into the memoryview
    ctypes.memmove(from_mv(dest), src.va_addr, len(dest))

  def _map(self, buf:HCQBuffer):
    # nothing to do when the buffer already has an MMIOInterface view, anything else cannot be mapped
    if buf.view is not None and isinstance(buf.view, MMIOInterface): return
    raise RuntimeError("Cannot map buffer without view to cpu")
|
|
|
|
class CPUDevice(HCQCompiled):
  """Device that wires together the CPU allocator, clang renderer and JIT compiler, CPUProgram and CPUComputeQueue."""
  def __init__(self, device:str=""):
    # built in the same order as they are passed to the base constructor
    allocator = CPUAllocator(self)
    renderer, compiler = ClangRenderer(), ClangJITCompiler()
    runtime = functools.partial(CPUProgram, self)
    super().__init__(device, allocator, renderer, compiler, runtime, HCQSignal, CPUComputeQueue, supports_graph=False)
|
|
|