openpilot_comma/tinygrad_repo/tinygrad/runtime/ops_gpu.py

from __future__ import annotations
import platform, functools
import numpy as np
import pyopencl as cl  # type: ignore
from typing import Dict, Optional, List, ClassVar, Final
from collections import defaultdict
from tinygrad.helpers import IMAGE, DEBUG, getenv
from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer
from tinygrad.codegen.gpu import GPUCodegen, GPULanguage

# True when platform.system() reports "Darwin".
OSX = platform.system() == "Darwin"
# Multiplier applied to OpenCL profiling deltas before they are converted to seconds.
# see test/external_osx_profiling.py to determine the Darwin ratio; per the original author the
# raw values there are "in like GPU clocks or something".
if OSX:
  OSX_TIMING_RATIO = 125/3
else:
  OSX_TIMING_RATIO = 1.0
# CLCACHE (default 1): when truthy, CLBuffer.__del__ parks its cl.Buffer in BUFFER_CACHE for reuse.
CLCACHE = getenv("CLCACHE", 1)
# FLOAT16 (default 0): when truthy, CLImage.fmt uses HALF_FLOAT channels instead of FLOAT.
FLOAT16 = getenv("FLOAT16", 0)

class _CL:
  """Lazily created OpenCL context and command queue.

  Both attributes are `functools.cached_property`s: nothing is created until first access,
  and each object is then created once per instance.
  """

  @functools.cached_property
  def cl_ctx(self) -> cl.Context:
    """Create a context on the device selected by the CL_DEVICE env var (default 0).

    GPU devices from every platform are collected first; CPU devices are only used
    when no GPU device is found.

    Raises:
      IndexError: if no device was found, or CL_DEVICE does not index into the device list.
    """
    platforms = cl.get_platforms()
    devices : List[cl.Device] = [d for p in platforms for d in p.get_devices(device_type=cl.device_type.GPU)]
    if not devices:  # settle for CPU
      devices = [d for p in platforms for d in p.get_devices(device_type=cl.device_type.CPU)]
    device_idx = getenv("CL_DEVICE", 0)
    try:
      device = devices[device_idx]
    except IndexError:
      # same exception type as plain list indexing, but say what actually went wrong
      raise IndexError(f"CL_DEVICE={device_idx} is out of range: found {len(devices)} OpenCL device(s)") from None
    if len(devices) > 1 or DEBUG >= 1: print(f"using {device}")
    return cl.Context(devices=[device])

  @functools.cached_property
  def cl_queue(self) -> cl.CommandQueue:
    """Create the command queue on this instance's context, with profiling enabled."""
    # profiling is required by CLProgram.__call__(wait=True), which reads e.profile.start/end
    return cl.CommandQueue(self.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)  # this is an in-order command queue

# module-wide instance used by the buffer and program classes in this file
CL = _CL()

class CLBuffer(RawBufferCopyInOut):
  """Raw READ_WRITE OpenCL buffer, optionally recycled through a size-keyed cache."""
  # TODO: this can be in RawBuffer generically
  # released cl.Buffer objects keyed by their size, handed back out by __init__
  BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list)

  def __init__(self, size):
    """Take a cached buffer of exactly `size` if one exists, otherwise allocate a new one."""
    self.size = size
    cached = CLBuffer.BUFFER_CACHE[size]
    if cached:
      # reused buffers were already counted in GlobalCounters.mem_used when first allocated
      self._cl = cached.pop()
    else:
      # TODO: on GPU OOM, clear the cache
      self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size)
      GlobalCounters.mem_used += self._cl.size

  def __del__(self):
    """Return the buffer to the cache (CLCACHE on) or subtract it from mem_used (CLCACHE off)."""
    # __del__ still runs when __init__ raised before _cl was assigned (e.g. a failed allocation);
    # there is nothing to release in that case and raising here would only hide the real error
    if not hasattr(self, "_cl"): return
    if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl)
    else: GlobalCounters.mem_used -= self._cl.size

  # host -> device; enqueued without blocking
  def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False)
  # device -> host; blocks until `x` has been filled
  def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True)

class CLImage(RawBuffer):  # pylint: disable=abstract-method
  """Raw READ_WRITE OpenCL image with RGBA channels, tracked in GlobalCounters.mem_used."""
  # channel type is HALF_FLOAT when FLOAT16 is truthy, FLOAT otherwise
  fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT)
  IMAGE : Final = True

  def __init__(self, shape):
    """Allocate the image; only shape[0] and shape[1] are read, and they are passed swapped."""
    self._cl = cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0]))
    GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height

  def __del__(self):
    """Subtract this image's row_pitch * height from GlobalCounters.mem_used."""
    # __del__ still runs when cl.Image raised in __init__; there is no _cl to account for then
    if hasattr(self, "_cl"): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height

@functools.lru_cache(maxsize=None)
class CLProgram:
  """A built OpenCL program exposing the kernel called `name` as a callable.

  The class is wrapped in an unbounded `functools.lru_cache`, so constructing it again with
  equal arguments returns the already-built instance; every constructor argument must
  therefore be hashable.
  """
  def __init__(self, name:str, prg:str, binary=False, argdtypes=None):
    """Build `prg` and look up kernel `name`.

    Args:
      name: kernel name to fetch from the built program.
      prg: passed to cl.Program as source, or as a single device binary when `binary` is truthy.
      binary: selects the binary constructor form of cl.Program.
      argdtypes: when not None, forwarded to the kernel's set_scalar_arg_dtypes.

    Raises:
      cl.RuntimeError: re-raised unchanged when the build fails (the program text is printed
        first when DEBUG >= 3).
    """
    self.name, self.argdtypes = name, argdtypes
    self.clprogram = cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg)  # type: ignore
    try:
      self._clprg = self.clprogram.build()
    except cl.RuntimeError:
      if DEBUG >= 3: print("FAILED TO BUILD", prg)
      raise
    # the attribute hook is called directly, so the kernel is fetched by name without going
    # through normal attribute lookup on the program object
    self.clprg = self._clprg.__getattr__(name)
    if DEBUG >= 5 and not OSX:
      # separate name so the `binary` parameter is not shadowed
      device_binary = self.clprogram.get_info(cl.program_info.BINARIES)[0]
      if 'Adreno' in CL.cl_ctx.devices[0].name:
        from disassemblers.adreno import disasm
        disasm(device_binary)
      else:
        # print the PTX for NVIDIA. TODO: probably broken for everything else
        # errors='replace' keeps this debug-only dump from aborting construction with a
        # UnicodeDecodeError when the device binary is not text; valid UTF-8 prints unchanged
        print(device_binary.decode('utf-8', errors='replace'))
    if self.argdtypes is not None: self.clprg.set_scalar_arg_dtypes(self.argdtypes)

  @staticmethod
  def max_work_group_size():
    """Return max_work_group_size of the first device in the shared context."""
    return CL.cl_ctx.devices[0].max_work_group_size

  def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]:
    """Enqueue the kernel on the shared queue.

    CLBuffer/CLImage arguments are replaced by their underlying `_cl` object; anything else
    is passed through as-is.

    Returns:
      None when `wait` is false. Otherwise the queue is finished first and the event's
      profiling delta (end - start), scaled by OSX_TIMING_RATIO and 1e-9, is returned.
    """
    e = self.clprg(CL.cl_queue, global_size, local_size, *[x._cl if isinstance(x, (CLBuffer, CLImage)) else x for x in bufs])
    if wait:
      CL.cl_queue.finish()
      return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9
    return None

class CLCodegen(GPUCodegen):
  """GPUCodegen configured with the OpenCL C keywords and built-ins below."""
  lang = GPULanguage(
    kernel_prefix="__kernel",
    buffer_prefix="__global ",
    smem_prefix="__local ",
    barrier="barrier(CLK_LOCAL_MEM_FENCE);",
    float4="(float4)",
    # one index expression per dimension 0..2
    gid=["get_global_id(0)", "get_global_id(1)", "get_global_id(2)"],
    lid=["get_local_id(0)", "get_local_id(1)", "get_local_id(2)"],
  )

class GPUBuffer(CompiledBuffer):
  """CompiledBuffer wired to the OpenCL raw buffer, codegen and runtime classes of this file."""
  raw_buffer_type = CLBuffer
  codegen_type = CLCodegen
  runtime_type = CLProgram

  # override this method for image
  @classmethod
  def create_raw_buffer(cls, shape, backing) -> RawBuffer:
    """Return a CLImage for unbacked 3-D shapes whose last dim is 4 when IMAGE >= 2; else defer to the parent."""
    image_shaped = len(shape) == 3 and shape[2] == 4
    if image_shaped and IMAGE >= 2 and backing is None:
      return CLImage(shape)
    return super().create_raw_buffer(shape, backing)
openpilot v0.9.4 release date: 2023-07-27T18:38:32 master commit: fa310d9e2542cf497d92f007baec8fd751ffa99c 2 years ago			`from __future__ import annotations`
			`import platform, functools`
			`import numpy as np`
			`import pyopencl as cl # type: ignore`
			`from typing import Dict, Optional, List, ClassVar, Final`
			`from collections import defaultdict`
			`from tinygrad.helpers import IMAGE, DEBUG, getenv`
			`from tinygrad.ops import CompiledBuffer, GlobalCounters, RawBufferCopyInOut, RawBuffer`
			`from tinygrad.codegen.gpu import GPUCodegen, GPULanguage`

			`OSX = platform.system() == "Darwin"`
			`OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something`
			`CLCACHE = getenv("CLCACHE", 1)`
			`FLOAT16 = getenv("FLOAT16", 0)`

			`class _CL:`
			`@functools.cached_property`
			`def cl_ctx(self) -> cl.Context:`
			`devices : List[cl.Device] = sum([x.get_devices(device_type=cl.device_type.GPU) for x in cl.get_platforms()], [])`
			`if len(devices) == 0: devices = sum([x.get_devices(device_type=cl.device_type.CPU) for x in cl.get_platforms()], []) # settle for CPU`
			`if len(devices) > 1 or DEBUG >= 1: print(f"using {devices[getenv('CL_DEVICE', 0)]}")`
			`return cl.Context(devices=[devices[getenv("CL_DEVICE", 0)]])`

			`@functools.cached_property`
			`def cl_queue(self) -> cl.CommandQueue:`
			`return cl.CommandQueue(CL.cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) # this is an in-order command queue`
			`CL = _CL()`

			`class CLBuffer(RawBufferCopyInOut):`
			`# TODO: this can be in RawBuffer generically`
			`BUFFER_CACHE : ClassVar[Dict[int, List[cl.Buffer]]] = defaultdict(list)`

			`def __init__(self, size):`
			`self.size = size`
			`if len(CLBuffer.BUFFER_CACHE[size]) > 0:`
			`self._cl = CLBuffer.BUFFER_CACHE[size].pop()`
			`else:`
			`# TODO: on GPU OOM, clear the cache`
			`self._cl = cl.Buffer(CL.cl_ctx, cl.mem_flags.READ_WRITE, size)`
			`GlobalCounters.mem_used += self._cl.size`

			`def __del__(self):`
			`if CLCACHE: CLBuffer.BUFFER_CACHE[self._cl.size].append(self._cl)`
			`else: GlobalCounters.mem_used -= self._cl.size`

			`def copyin(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, self._cl, x, is_blocking=False)`
			`def copyout(self, x:np.ndarray): cl.enqueue_copy(CL.cl_queue, x, self._cl, is_blocking=True)`

			`class CLImage(RawBuffer): # pylint: disable=abstract-method`
			`fmt : Final = cl.ImageFormat(cl.channel_order.RGBA, cl.channel_type.HALF_FLOAT if FLOAT16 else cl.channel_type.FLOAT)`
			`IMAGE : Final = True`

			`def __init__(self, shape):`
			`self._cl = cl.Image(CL.cl_ctx, cl.mem_flags.READ_WRITE, CLImage.fmt, shape=(shape[1], shape[0]))`
			`GlobalCounters.mem_used += self._cl.row_pitch * self._cl.height`

			`def __del__(self): GlobalCounters.mem_used -= self._cl.row_pitch * self._cl.height`

			`@functools.lru_cache(maxsize=None)`
			`class CLProgram:`
			`def __init__(self, name:str, prg:str, binary=False, argdtypes=None):`
			`self.name, self.argdtypes, self.clprogram = name, argdtypes, cl.Program(CL.cl_ctx, CL.cl_ctx.devices, [prg]) if binary else cl.Program(CL.cl_ctx, prg) # type: ignore`
			`try:`
			`self._clprg = self.clprogram.build()`
			`except cl.RuntimeError as e:`
			`if DEBUG >= 3: print("FAILED TO BUILD", prg)`
			`raise e`
			`self.clprg = self._clprg.__getattr__(name)`
			`if DEBUG >= 5 and not OSX:`
			`binary = self.clprogram.get_info(cl.program_info.BINARIES)[0]`
			`if 'Adreno' in CL.cl_ctx.devices[0].name:`
			`from disassemblers.adreno import disasm`
			`disasm(binary)`
			`else:`
			`# print the PTX for NVIDIA. TODO: probably broken for everything else`
			`print(binary.decode('utf-8'))`
			`if self.argdtypes is not None: self.clprg.set_scalar_arg_dtypes(self.argdtypes)`

			`@staticmethod`
			`def max_work_group_size(): return CL.cl_ctx.devices[0].max_work_group_size`

			`def __call__(self, global_size, local_size, *bufs, wait=False) -> Optional[float]:`
			`e = self.clprg(CL.cl_queue, global_size, local_size, *[x._cl if isinstance(x, (CLBuffer, CLImage)) else x for x in bufs])`
			`if wait:`
			`CL.cl_queue.finish()`
			`return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9`
			`return None`

			`class CLCodegen(GPUCodegen):`
			`lang = GPULanguage(`
			`kernel_prefix = "__kernel", buffer_prefix = "__global ", smem_prefix = "__local ",`
			`barrier = "barrier(CLK_LOCAL_MEM_FENCE);", float4 = "(float4)",`
			`gid = [f'get_global_id({i})' for i in range(3)], lid = [f'get_local_id({i})' for i in range(3)])`

			`class GPUBuffer(CompiledBuffer):`
			`raw_buffer_type = CLBuffer`
			`# override this method for image`
			`@classmethod`
			`def create_raw_buffer(cls, shape, backing) -> RawBuffer:`
			`if len(shape) == 3 and shape[2] == 4 and IMAGE >= 2 and backing is None: return CLImage(shape)`
			`else: return super().create_raw_buffer(shape, backing)`
			`codegen_type = CLCodegen`
			`runtime_type = CLProgram`