from __future__ import annotations
import platform, subprocess, sys, ctypes, ctypes.util, functools, time
from tinygrad.helpers import capstone_flatdump, getenv, from_mv, to_mv, OSX, mv_address, round_up, wait_cond
from tinygrad.device import Compiler, BufferSpec, DMACPURef
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocatorBase, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
from tinygrad.runtime.support.elf import jit_loader
from tinygrad.renderer.cstyle import ClangRenderer
from tinygrad.uop.ops import sint
class ClangJITCompiler(Compiler):
  """Compiler that shells out to clang and hands the resulting object to `jit_loader`."""

  def __init__(self, cachekey="compile_clang_jit"):
    super().__init__(cachekey)

  def compile(self, src:str) -> bytes:
    """Compile C source `src` and return whatever `jit_loader` produces from the object file.

    The compiler binary is taken from the CC environment variable (default: clang). The source is fed on stdin and the
    object is read from stdout. Raises subprocess.CalledProcessError if the compiler exits with a non-zero status.
    """
    # windows always targets x86_64, everything else targets the host machine
    target = 'x86_64' if sys.platform == 'win32' else platform.machine()
    # -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
    flags = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib', '-fno-ident']
    # x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm,
    # so tell the compiler not to use it
    if target == 'arm64':
      flags.append('-ffixed-x18')
    command = [getenv("CC", 'clang'), '-c', '-x', 'c', *flags, '-', '-o', '-']
    object_file = subprocess.check_output(command, input=src.encode('utf-8'))
    return jit_loader(object_file)

  def disassemble(self, lib:bytes):
    """Return the `capstone_flatdump` disassembly of `lib`."""
    return capstone_flatdump(lib)
class CPUComputeQueue(HWQueue):
  """Queue whose commands are Python callables stored inline as `fn, argc, args...` and run in order on submit."""

  # *** handlers: these are what actually run when the queue is submitted ***

  def _exec(self, prg, bufs, *args):
    """Call `prg.fxn`; the first `bufs` entries of `args` are buffer addresses, the rest are values."""
    # values are passed as 64-bit ints on arm64 and as 32-bit ints everywhere else
    value_type = ctypes.c_int64 if platform.machine() == "arm64" else ctypes.c_int32
    addresses = [ctypes.c_uint64(addr) for addr in args[:bufs]]
    values = [value_type(val) for val in args[bufs:]]
    prg.fxn(*addresses, *values)

  def _signal(self, signal_addr, value):
    """Store `value` as an unsigned 32-bit integer at `signal_addr`."""
    slot = to_mv(signal_addr, 4).cast('I')
    slot[0] = value

  def _wait(self, signal_addr, value):
    """Block via `wait_cond` until the unsigned 32-bit integer at `signal_addr` is >= `value` (timeout_ms=60000)."""
    def reached():
      return to_mv(signal_addr, 4).cast('I')[0] >= value
    wait_cond(reached, timeout_ms=60000)

  def _timestamp(self, timestamp_addr):
    """Store `time.perf_counter_ns()` as an unsigned 64-bit integer at `timestamp_addr`."""
    slot = to_mv(timestamp_addr, 8).cast('Q')
    slot[0] = time.perf_counter_ns()

  # *** queue building: every method below only records a command and returns self for chaining ***

  def cmd(self, cmd, *args):
    """Record handler `cmd` followed by its argument count and its arguments."""
    argc = len(args)
    self.q(cmd, argc, *args)
    return self

  def memory_barrier(self):
    """Record nothing; this queue just returns itself."""
    return self

  def exec(self, prg:CPUProgram, args_state:HCQArgsState, global_size, local_size):
    """Record a call of `prg` with the buffer addresses and values from `args_state`.

    `global_size` and `local_size` are accepted but not used here.
    """
    buffer_addrs = [buf.va_addr for buf in args_state.bufs]
    return self.cmd(self._exec, prg, len(args_state.bufs), *buffer_addrs, *args_state.vals)

  def wait(self, signal, value=0):
    """Record a wait until `signal` reaches at least `value`."""
    return self.cmd(self._wait, signal.value_addr, value)

  def timestamp(self, signal):
    """Record a timestamp write into `signal`."""
    return self.cmd(self._timestamp, signal.timestamp_addr)

  def signal(self, signal, value:sint=0):
    """Record a write of `value` into `signal`."""
    return self.cmd(self._signal, signal.value_addr, value)

  def _submit(self, dev):
    """Run every recorded command in order."""
    # Each command occupies 2 + argc slots in the queue: fn, argc, args...
    pos = 0
    while pos < len(self._q):
      handler, argc = self._q[pos], self._q[pos + 1]
      first_arg = pos + 2
      handler(*self._q[first_arg:first_arg + argc])
      pos = first_arg + argc
# NOTE: MAP_JIT was only added to the mmap module in python 3.13, so the flag value is defined here instead of being imported.
# It is only passed to mmap() when OSX is set.
MAP_JIT = 0x0800
class CPUProgram(HCQProgram):
  """Machine code loaded into executable memory and exposed as `self.fxn`, a ctypes function pointer returning None.

  `self.mem` holds the executable mapping: an integer address from VirtualAlloc on windows, an mmap object elsewhere.
  """
  # Runtime library providing pthread_jit_write_protect_np (macos) and __clear_cache (macos and linux), see __init__.
  rt_lib = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32') if OSX or sys.platform == "win32" else 'libgcc_s.so.1')

  def __init__(self, dev, name:str, lib:bytes):
    """Copy `lib` into freshly allocated executable memory and build the callable.

    Raises MemoryError on windows if the executable memory cannot be allocated.
    """
    if sys.platform == "win32":
      PAGE_EXECUTE_READWRITE, MEM_COMMIT, MEM_RESERVE = 0x40, 0x1000, 0x2000
      ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
      self.mem = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_void_p(0), ctypes.c_size_t(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
      # VirtualAlloc returns NULL on failure (None through c_void_p); fail here instead of letting memmove write through a null pointer
      if not self.mem: raise MemoryError(f"VirtualAlloc failed to allocate {len(lib)} bytes of executable memory")
      ctypes.memmove(self.mem, lib, len(lib))
      ctypes.windll.kernel32.GetCurrentProcess.restype = ctypes.c_void_p
      proc = ctypes.windll.kernel32.GetCurrentProcess()
      ctypes.windll.kernel32.FlushInstructionCache(ctypes.c_void_p(proc), ctypes.c_void_p(self.mem), ctypes.c_size_t(len(lib)))
      self.fxn = ctypes.CFUNCTYPE(None)(self.mem)
    else:
      from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
      # On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
      # MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
      self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)

      if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(False)
      self.mem.write(lib)
      if OSX: CPUProgram.rt_lib.pthread_jit_write_protect_np(True)

      # __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
      # libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
      # it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
      # Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
      CPUProgram.rt_lib["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))

      self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))

    super().__init__(HCQArgsState, dev, name, kernargs_alloc_size=0)

  def __del__(self):
    """Release the VirtualAlloc region on windows; nothing is done explicitly on other platforms."""
    # __del__ also runs when __init__ raised before self.mem was assigned, so don't assume the attribute exists or is non-null
    if sys.platform == 'win32' and getattr(self, 'mem', None):
      MEM_RELEASE = 0x8000
      ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(self.mem), ctypes.c_size_t(0), MEM_RELEASE)
class CPUAllocator(HCQAllocatorBase):
  """Allocator whose buffers are ctypes byte arrays in host memory."""

  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
    """Return an HCQBuffer of `size` bytes, wrapping `options.external_ptr` when it is set."""
    array_type = ctypes.c_uint8 * size
    if options.external_ptr:
      buf = array_type.from_address(options.external_ptr)
    else:
      # over-allocate by 0x1000 bytes so a 0x1000-aligned window of `size` bytes always fits inside the backing array
      backing = (ctypes.c_uint8 * (size + 0x1000))()
      aligned_start = round_up(ctypes.addressof(backing), 0x1000)
      buf = array_type.from_buffer(backing, aligned_start - ctypes.addressof(backing))
    va = ctypes.addressof(buf)
    sz = ctypes.sizeof(buf)
    # the ctypes array is passed as meta alongside the raw address and size
    return HCQBuffer(va, sz, meta=buf, view=MMIOInterface(va, sz, fmt='B'), owner=self.dev)

  def _as_buffer(self, src) -> memoryview:
    """Return a memoryview over the bytes of `src`."""
    return to_mv(src.va_addr, src.size)

  def _as_dmaref(self, buf):
    """Return a DMACPURef built from the address and size of `buf`."""
    return DMACPURef(buf.va_addr, buf.size)

  def _copyin(self, dest, src:memoryview):
    """Copy all of `src` into `dest`."""
    nbytes = len(src)
    ctypes.memmove(dest.va_addr, from_mv(src), nbytes)

  def _copyout(self, dest:memoryview, src):
    """Fill `dest` with the first len(dest) bytes of `src`."""
    nbytes = len(dest)
    ctypes.memmove(from_mv(dest), src.va_addr, nbytes)

  def _map(self, buf:HCQBuffer):
    """Check that `buf` has an MMIOInterface view; raises RuntimeError otherwise."""
    view = buf.view
    if view is None or not isinstance(view, MMIOInterface):
      raise RuntimeError("Cannot map buffer without view to cpu")
class CPUDevice(HCQCompiled):
  """CPU device wiring together the allocator, clang renderer/compiler, program, signal and queue types."""

  def __init__(self, device:str=""):
    allocator = CPUAllocator(self)
    renderer = ClangRenderer()
    compiler = ClangJITCompiler()
    # programs are constructed with this device bound as their first argument
    runtime = functools.partial(CPUProgram, self)
    super().__init__(device, allocator, renderer, compiler, runtime, HCQSignal, CPUComputeQueue, supports_graph=False)