openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

104 lines
6.6 KiB

from __future__ import annotations
import platform, subprocess, sys, ctypes, functools, time
from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, round_up, wait_cond
from tinygrad.device import Compiler, BufferSpec, DMACPURef
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
from tinygrad.runtime.support.elf import jit_loader
from tinygrad.renderer.cstyle import ClangRenderer
from tinygrad.uop.ops import sint
class ClangJITCompiler(Compiler):
def __init__(self, cachekey="compile_clang_jit"): super().__init__(cachekey)
def compile(self, src:str) -> bytes:
# -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
# x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
target = 'x86_64' if sys.platform == 'win32' else platform.machine()
args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
arch_args = ['-ffixed-x18'] if target == 'arm64' else []
obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
return jit_loader(obj)
def disassemble(self, lib:bytes): return capstone_flatdump(lib)
class CPUComputeQueue(HWQueue):
def _exec(self, prg, bufs, *args):
prg.fxn(*map(ctypes.c_uint64, args[:bufs]), *map(ctypes.c_int64 if platform.machine() == "arm64" else ctypes.c_int32, args[bufs:]))
def _signal(self, signal_addr, value): to_mv(signal_addr, 4).cast('I')[0] = value
def _wait(self, signal_addr, value): wait_cond(lambda: to_mv(signal_addr, 4).cast('I')[0] >= value, timeout_ms=60000)
def _timestamp(self, timestamp_addr): to_mv(timestamp_addr, 8).cast('Q')[0] = time.perf_counter_ns()
def cmd(self, cmd, *args):
self.q(cmd, len(args), *args)
return self
def memory_barrier(self): return self
def exec(self, prg:CPUProgram, args_state:HCQArgsState, global_size, local_size):
return self.cmd(self._exec, prg, len(args_state.bufs), *[x.va_addr for x in args_state.bufs], *args_state.vals)
def wait(self, signal, value=0): return self.cmd(self._wait, signal.value_addr, value)
def timestamp(self, signal): return self.cmd(self._timestamp, signal.timestamp_addr)
def signal(self, signal, value:sint=0): return self.cmd(self._signal, signal.value_addr, value)
def _submit(self, dev):
# Execute the commands in the queue: fn, argc, args...
off = 0
while off < len(self._q):
self._q[off](*self._q[off + 2:off + 2 + self._q[off + 1]])
off += self._q[off + 1] + 2
# NOTE: MAP_JIT is added to mmap module in python 3.13
MAP_JIT = 0x0800
class CPUProgram(HCQProgram):
rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')
def __init__(self, dev, name:str, lib:bytes):
if sys.platform == "win32":
PAGE_EXECUTE_READWRITE, MEM_COMMIT, MEM_RESERVE = 0x40, 0x1000, 0x2000
ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
self.mem = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_void_p(0), ctypes.c_size_t(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
ctypes.memmove(self.mem, lib, len(lib))
ctypes.windll.kernel32.GetCurrentProcess.restype = ctypes.c_void_p
proc = ctypes.windll.kernel32.GetCurrentProcess()
ctypes.windll.kernel32.FlushInstructionCache(ctypes.c_void_p(proc), ctypes.c_void_p(self.mem), ctypes.c_size_t(len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(self.mem)
else:
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
super().__init__(HCQArgsState, dev, name, kernargs_alloc_size=0)
def __del__(self):
if sys.platform == 'win32': ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), 0x8000) #0x8000 - MEM_RELEASE
class CPUAllocator(HCQAllocatorBase):
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
if options.external_ptr: buf = (ctypes.c_uint8 * size).from_address(options.external_ptr)
else:
offset = round_up(ctypes.addressof(tmpbuf:=(ctypes.c_uint8 * (size + 0x1000))()), 0x1000) - ctypes.addressof(tmpbuf)
buf = (ctypes.c_uint8 * size).from_buffer(tmpbuf, offset)
return HCQBuffer(va:=ctypes.addressof(buf), sz:=ctypes.sizeof(buf), meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev)
def _as_buffer(self, src) -> memoryview: return to_mv(src.va_addr, src.size)
def _as_dmaref(self, buf): return DMACPURef(buf.va_addr, buf.size)
def _copyin(self, dest, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), len(src))
def _copyout(self, dest:memoryview, src): ctypes.memmove(from_mv(dest), src.va_addr, len(dest))
def _map(self, buf:HCQBuffer):
if buf.view is None or not isinstance(buf.view, MMIOInterface): raise RuntimeError("Cannot map buffer without view to cpu")
class CPUDevice(HCQCompiled):
def __init__(self, device:str=""):
super().__init__(device, CPUAllocator(self), ClangRenderer(), ClangJITCompiler(), functools.partial(CPUProgram, self), HCQSignal, CPUComputeQueue,
supports_graph=False)