You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
103 lines
4.6 KiB
103 lines
4.6 KiB
from __future__ import annotations
|
|
import platform, functools
|
|
import numpy as np
|
|
import pyopencl as cl # type: ignore
|
|
from typing import Dict, Optional, List, ClassVar, Final
|
|
from collections import defaultdict
|
|
from tinygrad.helpers import IMAGE, DEBUG, getenv
|
|
from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
|
|
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage
|
|
|
|
OSX = platform.system() == "Darwin"
|
|
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
|
|
CLCACHE = getenv("CLCACHE", 1)
|
|
FLOAT16 = getenv("FLOAT16", 0)
|
|
|
|
class _CL:
|
|
@functools.cached_property
|
|
def cl_ctx(self) -> cl.Context:
|
|
devices : List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])
|
|
if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU
|
|
if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")
|
|
return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])
|
|
|
|
@functools.cached_property
|
|
def cl_queue(self) -> cl.CommandQueue:
|
|
return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue
|
|
CL = _CL()
|
|
|
|
class CLBuffer(RawBufferCopyInOut):
|
|
# TODO: this can be in RawBuffer generically
|
|
BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list)
|
|
|
|
def __init__(self, size):
|
|
self.size = size
|
|
if len(CLBuffer.BUFFER_CACHE[size]) > 0:
|
|
self._cl = CLBuffer.BUFFER_CACHE[size].pop()
|
|
else:
|
|
# TODO: on GPU OOM, clear the cache
|
|
self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size)
|
|
GlobalCounters.mem_used += self._cl.size
|
|
|
|
def __del__(self):
|
|
if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl)
|
|
else: GlobalCounters.mem_used -= self._cl.size
|
|
|
|
def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False)
|
|
def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True)
|
|
|
|
class CLImage(RawBuffer): # pylint: disable=abstract-method
|
|
fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT)
|
|
IMAGE : Final = True
|
|
|
|
def __init__(self, shape):
|
|
self._cl = cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0]))
|
|
GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height
|
|
|
|
def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height
|
|
|
|
@functools.lru_cache(maxsize=None)
|
|
class CLProgram:
|
|
def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
|
|
self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore
|
|
try:
|
|
self._clprg = self.clprogram.build()
|
|
except cl.RuntimeError as e:
|
|
if DEBUG >= 3: print("FAILED TO BUILD", prg)
|
|
raise e
|
|
self.clprg = self._clprg.__getattr__(name)
|
|
if DEBUG >= 5 and not OSX:
|
|
binary = self.clprogram.get_info(cl.program_info.BINARIES)[0]
|
|
if 'Adreno' in CL.cl_ctx.devices[0].name:
|
|
from disassemblers.adreno import disasm
|
|
disasm(binary)
|
|
else:
|
|
# print the PTX for NVIDIA. TODO: probably broken for everything else
|
|
print(binary.decode('utf-8'))
|
|
if self.argdtypes is not None: self.clprg.set_scalar_arg_dtypes(self.argdtypes)
|
|
|
|
@staticmethod
|
|
def max_work_group_size(): return CL.cl_ctx.devices[0].max_work_group_size
|
|
|
|
def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]:
|
|
e = self.clprg(CL.cl_queue, global_size, local_size, *[x._cl if isinstance(x, (CLBuffer, CLImage)) else x for x in bufs])
|
|
if wait:
|
|
CL.cl_queue.finish()
|
|
return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9
|
|
return None
|
|
|
|
class CLCodegen(GPUCodegen):
|
|
lang = GPULanguage(
|
|
kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ",
|
|
barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",
|
|
gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])
|
|
|
|
class GPUBuffer(CompiledBuffer):
|
|
raw_buffer_type = CLBuffer
|
|
# override this method for image
|
|
@classmethod
|
|
def create_raw_buffer(cls, shape, backing) -> RawBuffer:
|
|
if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)
|
|
else: return super().create_raw_buffer(shape, backing)
|
|
codegen_type = CLCodegen
|
|
runtime_type = CLProgram
|
|
|