from __future__ import annotations
import os
os . environ [ ' PYOPENCL_NO_CACHE ' ] = ' 1 '
import pathlib
import numpy as np
import pyopencl as cl # type: ignore
from typing import Optional , List , Tuple
from tinygrad . helpers import DEBUG , getenv , prod , ImageDType , OSX , fromimport , diskcache
from tinygrad . ops import Compiled
from tinygrad . renderer . opencl import OpenCLRenderer
from tinygrad . runtime . lib import RawBufferCopyInOut , LRUAllocator , RawBufferTransfer
from tinygrad . codegen . kernel import LinearizerOptions
# Multiplier applied to raw OpenCL profiling deltas before they are converted to seconds.
# On OSX the raw values are not nanoseconds (they're in like GPU clocks or something);
# see test/external/external_osx_profiling.py to determine this ratio.
OSX_TIMING_RATIO = 125 / 3 if OSX else 1.0

# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()

# Directory containing the ROCm LLVM binaries.
ROCM_LLVM_PATH = pathlib.Path("/opt/rocm/llvm/bin")
#ROCM_LLVM_PATH = pathlib.Path(__file__).parents[3] / "extra/rocm/build/llvm-project/bin"

# The early-exec helper is only set up at the most verbose debug levels.
if DEBUG >= 5:
    early_exec = fromimport("extra.helpers", "enable_early_exec")()
class CLAllocator(LRUAllocator):
    """LRU allocator whose allocations are OpenCL buffers or images."""

    def _do_alloc(self, size, dtype, device, **kwargs):
        """Create the OpenCL memory object for one allocation.

        ``int(device)`` is used as an index into ``CL.cl_ctxs``. An
        ``ImageDType`` yields a read/write RGBA ``cl.Image``; any other dtype
        yields a read/write ``cl.Buffer`` of ``size * dtype.itemsize`` bytes.
        The device index is recorded on the returned object as ``.device``.
        """
        if not isinstance(dtype, ImageDType):
            buf = cl.Buffer(CL.cl_ctxs[int(device)], cl.mem_flags.READ_WRITE, size * dtype.itemsize)
        else:
            # NOTE: the memory is a bit off here due to padding, it's buf.row_pitch * buf.height * 4 * dtype.itemsize
            assert size == prod(dtype.shape), f"image size mismatch {size} != {dtype.shape}"
            # only 2-byte (half) and 4-byte (float) channels are supported; anything else is a KeyError
            channel_types = {2: cl.channel_type.HALF_FLOAT, 4: cl.channel_type.FLOAT}
            image_format = cl.ImageFormat(cl.channel_order.RGBA, channel_types[dtype.itemsize])
            # the image shape is passed with the first two dtype.shape entries swapped
            buf = cl.Image(CL.cl_ctxs[int(device)], cl.mem_flags.READ_WRITE, image_format,
                           shape=(dtype.shape[1], dtype.shape[0]))
        # device is tracked on the underlying buffer
        buf.device = int(device)
        return buf
class _CL:
    """Holder for the OpenCL runtime state: devices, contexts, queues and allocator."""

    def __init__(self):
        """Enumerate OpenCL devices and pick the set to use.

        Device lists are collected per platform, all GPU lists first and all
        CPU lists after, with empty lists dropped. ``CL_PLATFORM`` (default 0)
        indexes into that combined list; ``CL_EXCLUDE`` is a comma-separated
        list of device names to leave out. Raises ``IndexError`` if the index
        is out of range or no device remains after exclusion.
        """
        cl_platforms = cl.get_platforms()
        gpu_lists = [x.get_devices(device_type=cl.device_type.GPU) for x in cl_platforms]
        cpu_lists = [x.get_devices(device_type=cl.device_type.CPU) for x in cl_platforms]
        platform_devices: List[List[cl.Device]] = [y for y in (gpu_lists + cpu_lists) if y]
        excluded = getenv('CL_EXCLUDE', "").split(",")
        self.devices = [device for device in platform_devices[getenv('CL_PLATFORM', 0)] if device.name not in excluded]
        self.cl_platform = self.devices[0].platform

    def post_init(self, device=None):
        """Create contexts, command queues and the allocator.

        With ``device`` left as ``None`` one context is created per selected
        device; otherwise a single context for ``self.devices[device]``. Each
        context gets one profiling-enabled command queue on its first device.
        """
        if device is None:
            self.cl_ctxs: List[cl.Context] = [cl.Context(devices=[x]) for x in self.devices]
        else:
            self.cl_ctxs = [cl.Context(devices=[self.devices[device]])]
        if DEBUG >= 1:
            print(f"using devices: {[ctx.devices[0].hashable_model_and_version_identifier for ctx in self.cl_ctxs]}")
        self.cl_queue: List[cl.CommandQueue] = [
            cl.CommandQueue(ctx, device=ctx.devices[0], properties=cl.command_queue_properties.PROFILING_ENABLE)
            for ctx in self.cl_ctxs
        ]
        # Read the contexts from self rather than the module-level singleton, so the
        # allocator always matches the contexts this instance just created.
        # The allocator is constructed with the first device's global memory size.
        self.cl_allocator = CLAllocator(self.cl_ctxs[0].devices[0].get_info(cl.device_info.GLOBAL_MEM_SIZE))

    def synchronize(self):
        """Block until every command queue has finished its outstanding work."""
        for q in self.cl_queue:
            q.finish()
# Module-wide OpenCL state. Contexts and queues are created right away unless
# DELAYED_RUNTIME_INIT is set, in which case post_init() is not called here.
CL = _CL()
if not getenv("DELAYED_RUNTIME_INIT", False):
    CL.post_init()
class CLBuffer(RawBufferCopyInOut, RawBufferTransfer):
    """Raw buffer whose storage is an OpenCL memory object from ``CL.cl_allocator``."""

    def __init__(self, size, dtype, device='0'):
        super().__init__(size, dtype, allocator=CL.cl_allocator, device=device)

    def _copyin(self, x: np.ndarray):
        """Enqueue a non-blocking host-to-device copy of ``x`` and remember its event."""
        assert not self.dtype.name.startswith("image"), f"can't copyin images {self.dtype}"
        queue = CL.cl_queue[self._buf.device]
        # enqueue_copy is handed a C-contiguous, aligned array
        host_array = np.require(x, requirements=['C', 'A'])
        self.event = cl.enqueue_copy(queue, self._buf, host_array, is_blocking=False)

    def _copyout(self, x: np.ndarray):
        """Copy the device buffer into ``x``, blocking until the copy is done."""
        assert not self.dtype.name.startswith("image"), f"can't copyout images {self.dtype}"
        CL.cl_allocator.ensure_has_free_space(self.size, self.dtype, self._device)
        dev = self._buf.device
        # wrap the destination array's memory in a CL buffer and map it for writing
        host_buf = cl.Buffer(CL.cl_ctxs[dev], cl.mem_flags.WRITE_ONLY | cl.mem_flags.USE_HOST_PTR, 0, hostbuf=x.data)
        mapped, map_event = cl.enqueue_map_buffer(CL.cl_queue[dev], host_buf, cl.map_flags.WRITE, 0, self.size,
                                                  dtype=self.dtype.np, is_blocking=False)
        with mapped.base:
            # wait for the mapping, plus the last copyin on this buffer if there was one
            pending = [map_event]
            if hasattr(self, "event"):
                pending.append(self.event)
            cl.enqueue_copy(CL.cl_queue[dev], mapped, self._buf, is_blocking=True, wait_for=pending)

    def _transfer(self, x):
        """Copy ``x``'s contents into this buffer via the AMD p2p extension.

        Raises ``NotImplementedError`` when the device name of ``x``'s context
        does not contain "gfx".
        """
        src_dev = x._buf.device
        if "gfx" not in CL.cl_ctxs[src_dev].devices[0].name:
            raise NotImplementedError("p2p transfer between devices not implemented on non-amd")
        nbytes = x.size * x.dtype.itemsize
        cl.enqueue_copy_buffer_p2p_amd(CL.cl_platform, CL.cl_queue[src_dev], x._buf, self._buf, nbytes).wait()
@diskcache
def compile_gpu(prg: str) -> bytes:
    """Build the OpenCL source ``prg`` on the first context and return its first binary."""
    program = cl.Program(CL.cl_ctxs[0], prg)
    program.build()
    binaries = program.get_info(cl.program_info.BINARIES)
    return binaries[0]
class CLProgram:
    """A compiled kernel, loaded once per OpenCL context and callable on buffers."""

    def __init__(self, name: str, prg: bytes, argdtypes=None, options=None):
        """Load binary ``prg`` into every context and look up the kernel called ``name``."""
        programs = [cl.Program(ctx, ctx.devices, [prg] * len(ctx.devices)) for ctx in CL.cl_ctxs]  # type: ignore
        self.name, self.clprograms = name, programs
        self._clprgs = [program.build(options=options) for program in self.clprograms]
        self.clprgs = [built.__getattr__(name) for built in self._clprgs]
        if DEBUG >= 5 and not OSX:
            device_name = CL.cl_ctxs[0].devices[0].name
            if 'Adreno' in device_name:
                fromimport('disassemblers.adreno', 'disasm')(prg)
            elif device_name.startswith('gfx'):
                asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], prg))
                print('\n'.join(line for line in asm.decode('utf-8').split("\n") if 's_code_end' not in line))
            else:
                # print the PTX for NVIDIA. TODO: probably broken for everything else
                print(prg.decode('utf-8'))
        if argdtypes is not None:
            self.set_argdtypes(argdtypes)

    def set_argdtypes(self, argdtypes):
        """Apply ``argdtypes`` as the scalar argument dtypes of every kernel, then store them."""
        for kernel in self.clprgs:
            kernel.set_scalar_arg_dtypes(argdtypes)
        self.argdtypes = argdtypes

    @staticmethod
    def max_work_group_size():
        """Return ``max_work_group_size`` of the first context's first device."""
        return CL.cl_ctxs[0].devices[0].max_work_group_size

    def __call__(self, *bufs, global_size: Tuple[int, int, int], local_size: Optional[Tuple[int, int, int]] = None,
                 wait=False) -> Optional[float]:
        """Enqueue the kernel on the device of the first argument.

        When ``local_size`` is given, the launch size is the element-wise
        product of ``global_size`` and ``local_size``; otherwise ``global_size``
        is passed through unchanged. With ``wait`` set, blocks on the kernel's
        event and returns the profiled duration scaled by ``OSX_TIMING_RATIO * 1e-9``,
        or ``None`` if no profiling info is available. Returns ``None`` when
        ``wait`` is false.
        """
        if not hasattr(self, 'argdtypes'):
            # no dtypes supplied yet: CLBuffer arguments map to None, everything else to int32
            self.set_argdtypes(tuple(None if x.__class__ is CLBuffer else np.int32 for x in bufs))
        cl_bufs, wait_for = [], []
        for x in bufs:
            if x.__class__ is not CLBuffer:
                cl_bufs.append(x)
                continue
            cl_bufs.append(x._buf)
            # make the kernel wait on any copy still pending on this buffer
            if hasattr(x, "event"):
                wait_for.append(x.event)
        device = cl_bufs[0].device
        kernel, queue = self.clprgs[device], CL.cl_queue[device]
        if local_size is not None:
            launch_size = [int(g * l) for g, l in zip(global_size, local_size)]
        else:
            launch_size = global_size
        e = kernel(queue, launch_size, local_size, *cl_bufs, wait_for=wait_for)
        if not wait:
            return None
        e.wait()
        try:
            return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9
        except cl.RuntimeError:  # no profiling info available
            return None
# Wire this module's OpenCL buffer, renderer, compile function, program class and
# synchronize hook into a single Compiled backend object.
GPUBuffer = Compiled(
    CLBuffer,
    LinearizerOptions(),
    OpenCLRenderer,
    compile_gpu,
    CLProgram,
    CL.synchronize,
)