from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, math, sys
assert sys.platform != 'win32'
from types import SimpleNamespace
from typing import Any, cast, ClassVar
from tinygrad.device import BufferSpec
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.support.hcq import HWInterface
from tinygrad.runtime.autogen import kgsl, adreno
from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import

BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2

# Parse C-style defines: <regname>_<field_x>__SHIFT and <regname>_<field_y>__MASK from the adreno module into the following format:
# qreg.<regname>(<field_x>=..., <field_y>=..., ..., <field_n>=...)
def _qreg_exec(reg, __val=0, **kwargs):
  for k, v in kwargs.items():
    __val |= (getattr(adreno, f'{reg[4:]}_{k.upper()}') if v else 0) if type(v) is bool else (v << getattr(adreno, f'{reg[4:]}_{k.upper()}__SHIFT'))
  return __val
qreg: Any = type("QREG", (object,), {name[4:].lower(): functools.partial(_qreg_exec, name) for name in adreno.__dict__.keys() if name[:4] == 'REG_'})

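# Illustrative note (not part of the driver logic): each qreg.<regname>(...) call ORs field values together
# using the autogenerated constants. Assuming the usual adreno autogen names,
#   qreg.a6xx_hlsq_cs_cntl_1(threadsize=adreno.THREAD64)
# evaluates to adreno.THREAD64 << adreno.A6XX_HLSQ_CS_CNTL_1_THREADSIZE__SHIFT; boolean fields OR in the
# field's bitmask define directly, and a positional value is OR'd in unshifted.
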
def next_power2(x): return 1 if x == 0 else 1 << (x - 1).bit_length()

def parity(val: int):
  for i in range(4,1,-1): val ^= val >> (1 << i)
  return (~0x6996 >> (val & 0xf)) & 1

def pkt7_hdr(opcode: int, cnt: int): return adreno.CP_TYPE7_PKT | cnt & 0x3FFF | parity(cnt) << 15 | (opcode & 0x7F) << 16 | parity(opcode) << 23

def pkt4_hdr(reg: int, cnt: int): return adreno.CP_TYPE4_PKT | cnt & 0x7F | parity(cnt) << 7 | (reg & 0x3FFFF) << 8 | parity(reg) << 27

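# Illustrative note: pkt7_hdr packs a CP type-7 packet header (payload count in bits 0-13, the count's
# odd-parity bit at bit 15, the 7-bit opcode at bits 16-22 with its parity at bit 23), while pkt4_hdr
# packs a type-4 register-write header (count in bits 0-6, register offset in bits 8-25, each with its
# parity bit). parity() folds the value down to a nibble and looks it up in the ~0x6996 (0x9669) table.
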
class QCOMCompiler(CLCompiler):
  def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')
  def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)

class QCOMSignal(HCQSignal):
  def __init__(self, base_addr:int|None=None, **kwargs):
    super().__init__(base_addr, **kwargs, timestamp_divider=19.2, dev_t=QCOMDevice)

  def _sleep(self, time_spent_waiting_ms:int):
    # Sleep only for timeline signals. Do it immediately to free the CPU.
    if self.timeline_for_device is not None:
      kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.timeline_for_device.fd, context_id=self.timeline_for_device.ctx,
                                                  timestamp=self.timeline_for_device.last_cmd, timeout=0xffffffff)

class QCOMComputeQueue(HWQueue):
  def __del__(self):
    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))

  def cmd(self, opcode: int, *vals: int): self.q(pkt7_hdr(opcode, len(vals)), *vals)

  def reg(self, reg: int, *vals: int): self.q(pkt4_hdr(reg, len(vals)), *vals)

  def _cache_flush(self, write_back=True, invalidate=False, sync=True, memsync=False):
    # TODO: 7xx support.
    if write_back: self.cmd(adreno.CP_EVENT_WRITE, adreno.CACHE_FLUSH_TS, *data64_le(QCOMDevice.dummy_addr), 0) # dirty cache write-back.
    if invalidate: self.cmd(adreno.CP_EVENT_WRITE, adreno.CACHE_INVALIDATE) # invalidate cache lines (following reads from RAM).
    if memsync: self.cmd(adreno.CP_WAIT_MEM_WRITES)
    if sync: self.cmd(adreno.CP_WAIT_FOR_IDLE)

  def memory_barrier(self):
    self._cache_flush(write_back=True, invalidate=True, sync=True, memsync=True)
    return self

  def signal(self, signal:QCOMSignal, value=0, ts=False):
    self.cmd(adreno.CP_WAIT_FOR_IDLE)
    if QCOMDevice.gpu_id < 700:
      self.cmd(adreno.CP_EVENT_WRITE, qreg.cp_event_write_0(event=adreno.CACHE_FLUSH_TS, timestamp=ts),
               *data64_le(signal.timestamp_addr if ts else signal.value_addr), qreg.cp_event_write_3(value & 0xFFFFFFFF))
      self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
    else:
      # TODO: support devices starting with 8 Gen 1. Also, the 700 series has the convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP.
      raise RuntimeError('CP_EVENT_WRITE7 is not supported')
    return self

  def timestamp(self, signal:QCOMSignal): return self.signal(signal, 0, ts=True)

  def wait(self, signal:QCOMSignal, value=0):
    self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(signal.value_addr),
             qreg.cp_wait_reg_mem_3(ref=value&0xFFFFFFFF), qreg.cp_wait_reg_mem_4(mask=0xFFFFFFFF), qreg.cp_wait_reg_mem_5(delay_loop_cycles=32))
    return self

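  # Note: signals are plain 32-bit values in GPU-visible memory. signal() has the CP write the value
  # via a CACHE_FLUSH_TS event once prior work is done, and wait() polls the same address with
  # CP_WAIT_REG_MEM until it reads a value >= the reference, so queues can synchronize on each other.
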
  def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None):
    to_mv((hw_page_addr:=hw_addr or dev.cmd_buf_allocator.alloc(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q)
    obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self._q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
    submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx,
                                              cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object))
    return submit_req, obj

  def bind(self, dev:QCOMDevice):
    self.binded_device = dev
    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
    self.submit_req, self.obj = self._build_gpu_command(self.binded_device, self.hw_page.va_addr)
    # From now on, the queue is on the device for faster submission.
    self._q = to_mv(self.obj.gpuaddr, len(self._q) * 4).cast("I")

  def _submit(self, dev:QCOMDevice):
    if self.binded_device == dev: submit_req = self.submit_req
    else: submit_req, _ = self._build_gpu_command(dev)
    dev.last_cmd = kgsl.IOCTL_KGSL_GPU_COMMAND(dev.fd, __payload=submit_req).timestamp

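  # Note: submission copies the packet words into GPU-visible memory and hands them to the kernel driver
  # as a single indirect buffer via IOCTL_KGSL_GPU_COMMAND. The ioctl returns a per-context timestamp,
  # which is what timeline signals later wait on in QCOMSignal._sleep.
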
  def exec(self, prg:QCOMProgram, args_state:QCOMArgsState, global_size, local_size):
    self.bind_args_state(args_state)

    def cast_int(x, ceil=False): return (math.ceil(x) if ceil else int(x)) if isinstance(x, float) else x
    global_size_mp = [cast_int(g*l) for g,l in zip(global_size, local_size)]

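    # Note: global_size here counts workgroups per axis, so the NDRANGE size registers below get the
    # total work-item counts (workgroup count * local size), while the last three values written into
    # that register block are the per-axis workgroup counts themselves.
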
    self.cmd(adreno.CP_SET_MARKER, qreg.a6xx_cp_set_marker_0(mode=adreno.RM6_COMPUTE))
    self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, qreg.a6xx_hlsq_invalidate_cmd(cs_state=True, cs_ibo=True))
    self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, 0x0)
    self.reg(adreno.REG_A6XX_SP_CS_TEX_COUNT, qreg.a6xx_sp_cs_tex_count(0x80))
    self.reg(adreno.REG_A6XX_SP_CS_IBO_COUNT, qreg.a6xx_sp_cs_ibo_count(0x40))
    self.reg(adreno.REG_A6XX_SP_MODE_CONTROL, qreg.a6xx_sp_mode_control(isammode=adreno.ISAMMODE_CL))
    self.reg(adreno.REG_A6XX_SP_PERFCTR_ENABLE, qreg.a6xx_sp_perfctr_enable(cs=True))
    self.reg(adreno.REG_A6XX_SP_TP_MODE_CNTL, qreg.a6xx_sp_tp_mode_cntl(isammode=adreno.ISAMMODE_CL, unk3=2))
    self.reg(adreno.REG_A6XX_TPL1_DBG_ECO_CNTL, 0)
    self.cmd(adreno.CP_WAIT_FOR_IDLE)

    self.reg(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0,
             qreg.a6xx_hlsq_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1),
             global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_hlsq_cs_cntl_1(threadsize=adreno.THREAD64),
             cast_int(global_size[0], ceil=True), cast_int(global_size[1], ceil=True), cast_int(global_size[2], ceil=True))

    self.reg(adreno.REG_A6XX_SP_CS_CTRL_REG0,
             qreg.a6xx_sp_cs_ctrl_reg0(threadsize=adreno.THREAD64, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
             qreg.a6xx_sp_cs_unknown_a9b1(unk6=True, shared_size=prg.shared_size), 0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr),
             qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.dev._stack.va_addr),
             qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))

    self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
                                                               state_block=adreno.SB6_CS_SHADER, num_unit=1024 // 4),
             *data64_le(args_state.ptr))
    self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
                                                               state_block=adreno.SB6_CS_SHADER, num_unit=round_up(prg.image_size, 128) // 128),
             *data64_le(prg.lib_gpu.va_addr))

    self.reg(adreno.REG_A6XX_HLSQ_CONTROL_2_REG, 0xfcfcfcfc, 0xfcfcfcfc, 0xfcfcfcfc, 0xfc, qreg.a6xx_hlsq_cs_cntl(constlen=1024 // 4, enabled=True))

    self.reg(adreno.REG_A6XX_SP_CS_PVT_MEM_HW_STACK_OFFSET, qreg.a6xx_sp_cs_pvt_mem_hw_stack_offset(prg.hw_stack_offset))
    self.reg(adreno.REG_A6XX_SP_CS_INSTRLEN, qreg.a6xx_sp_cs_instrlen(prg.image_size // 4))

    if args_state.prg.samp_cnt > 0:
      self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_SHADER, state_src=adreno.SS6_INDIRECT,
                                                                 state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
               *data64_le(args_state.ptr + args_state.prg.samp_off))
      self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
      self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr))

    if args_state.prg.tex_cnt > 0:
      self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
                                                                 state_block=adreno.SB6_CS_TEX, num_unit=min(16, args_state.prg.tex_cnt)),
               *data64_le(args_state.ptr + args_state.prg.tex_off))
      self.reg(adreno.REG_A6XX_SP_CS_TEX_CONST, *data64_le(args_state.ptr + args_state.prg.tex_off))

    if args_state.prg.ibo_cnt > 0:
      self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST6_IBO, state_src=adreno.SS6_INDIRECT,
                                                                 state_block=adreno.SB6_CS_SHADER, num_unit=args_state.prg.ibo_cnt),
               *data64_le(args_state.ptr + args_state.prg.ibo_off))
      self.reg(adreno.REG_A6XX_SP_CS_IBO, *data64_le(args_state.ptr + args_state.prg.ibo_off))

    self.reg(adreno.REG_A6XX_SP_CS_CONFIG,
             qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nibo=args_state.prg.ibo_cnt))
    self.cmd(adreno.CP_RUN_OPENCL, 0)
    self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
    return self

class QCOMArgsState(HCQArgsState):
  def __init__(self, ptr:int, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
    super().__init__(ptr, prg, bufs, vals=vals)

    if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')

    self.buf_info, self.args_info, self.args_view = prg.buf_info[:len(bufs)], prg.buf_info[len(bufs):], to_mv(ptr, prg.kernargs_alloc_size).cast('Q')

    ctypes.memset(self.ptr, 0, prg.kernargs_alloc_size)
    for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')

    if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
    for i, b in enumerate(bufs):
      if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
        obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
        to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
      self.bind_sints_to_ptr(b.va_addr, ptr=self.ptr + self.buf_info[i].offset + (0 if self.buf_info[i].type is BUFTYPE_BUF else 16), fmt='Q')

    for i, v in enumerate(vals): self.bind_sints_to_ptr(v, ptr=self.ptr + self.args_info[i].offset, fmt='I')

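# Note on the argument buffer QCOMArgsState writes: plain buffers and integer values land at the offsets
# the compiled lib reports in buf_info (inside the first 2048 bytes of constant space), while textures
# and IBOs get their full descriptor copied to tex_off/ibo_off and only the 64-bit base address (16 bytes
# into the descriptor) is re-bound per launch; samplers, if any, are copied to samp_off.
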
class QCOMProgram(HCQProgram):
  def __init__(self, dev: QCOMDevice, name: str, lib: bytes):
    self.dev: QCOMDevice = dev
    self.name, self.lib = name, lib
    self._parse_lib()

    self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, options=BufferSpec(cpu_access=True, nolru=True))
    to_mv(cast(int, self.lib_gpu.va_addr), self.image_size)[:] = self.image

    self.pvtmem_size_per_item: int = round_up(self.pvtmem, 512) >> 9
    self.pvtmem_size_total: int = self.pvtmem_size_per_item * 128 * 2
    self.hw_stack_offset: int = round_up(next_power2(round_up(self.pvtmem, 512)) * 128 * 16, 0x1000)
    self.shared_size: int = max(1, (self.shmem - 1) // 1024)
    self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs + round_up(self.hregs, 2) // 2)) * 128)) * 128)
    dev._ensure_stack_size(self.hw_stack_offset * 4)

    kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
    super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)

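  # Illustrative arithmetic (not executed), assuming a kernel with fregs=8 and hregs=0: the occupancy
  # estimate is max_threads = min(1024, ((384 * 32) // (8 * 128)) * 128) = min(1024, 1536) = 1024,
  # i.e. the register footprint caps how many work-items may run together, clamped to 1024.
  # The kernarg area is 2048 bytes of constant space plus 0x40 per texture/IBO descriptor and 0x10 per
  # sampler, rounded up to 0x100.
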
  def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
    if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
    if any(g*l>mx for g,l,mx in zip(global_size, local_size, [65536, 65536, 65536])) and any(l>mx for l,mx in zip(local_size, [1024, 1024, 1024])):
      raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
    return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

  def _parse_lib(self):
    def _read_lib(off) -> int: return struct.unpack("I", self.lib[off:off+4])[0]

    # Extract the image binary.
    self.image_size = _read_lib(0x100)
    self.image = bytearray(self.lib[(image_offset:=_read_lib(0xc0)):image_offset+self.image_size])

    # Parse image descriptors.
    image_desc_off = _read_lib(0x110)
    self.prg_offset, self.brnchstck = _read_lib(image_desc_off+0xc4), _read_lib(image_desc_off+0x108) // 2
    self.pvtmem, self.shmem = _read_lib(image_desc_off+0xc8), _read_lib(image_desc_off+0xd8)

    # Fill up info about constants and buffers.
    self.buf_info, self.consts_info = [], []

    # Collect sampler info.
    self.samp_cnt = samp_cnt_in_file = _read_lib(image_desc_off + 0xdc)
    assert self.samp_cnt <= 1, "Up to one sampler supported"
    if self.samp_cnt:
      self.samp_cnt += 1
      self.samplers = [qreg.a6xx_tex_samp_0(wrap_s=(clamp_mode:=adreno.A6XX_TEX_CLAMP_TO_BORDER), wrap_t=clamp_mode, wrap_r=clamp_mode),
                       qreg.a6xx_tex_samp_1(unnorm_coords=True, cubemapseamlessfiltoff=True), 0, 0, 0, 0, 0, 0]

    # Collect kernel arguments (buffers) info.
    bdoff = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samp_cnt_in_file
    while bdoff + 32 <= len(self.lib):
      length, _, _, offset_words, _, _, _, typ = struct.unpack("IIIIIIII", self.lib[bdoff:bdoff+32])
      if length == 0: break
      self.buf_info.append(SimpleNamespace(offset=offset_words * 4, type=typ))
      bdoff += length

    # Set the correct offsets for textures/ibos.
    self.tex_cnt, self.ibo_cnt = sum(x.type is BUFTYPE_TEX for x in self.buf_info), sum(x.type is BUFTYPE_IBO for x in self.buf_info)
    self.ibo_off, self.tex_off, self.samp_off = 2048, 2048 + 0x40 * self.ibo_cnt, 2048 + 0x40 * self.tex_cnt + 0x40 * self.ibo_cnt
    cur_ibo_off, cur_tex_off = self.ibo_off, self.tex_off
    for x in self.buf_info:
      if x.type is BUFTYPE_IBO: x.offset, cur_ibo_off = cur_ibo_off, cur_ibo_off + 0x40
      elif x.type is BUFTYPE_TEX: x.offset, cur_tex_off = cur_tex_off, cur_tex_off + 0x40

    if _read_lib(0xb0) != 0: # check if we have constants.
      cdoff = _read_lib(0xac)
      while cdoff + 40 <= image_offset:
        cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
        self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
        cdoff += 40

    # Registers info.
    reg_desc_off = _read_lib(0x34)
    self.fregs, self.hregs = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18)

  def __del__(self):
    if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))

class QCOMTextureInfo:
  def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
    self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo

class QCOMAllocator(HCQAllocatorBase):
  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
    # Recalculate the real size for a texture.
    if options.image is not None:
      imgw, imgh, itemsize_log = options.image.shape[1], options.image.shape[0], int(math.log2(options.image.itemsize))
      pitchalign = max(6, 11 - int(math.log2(imgh))) if imgh > 1 else 6
      align_up = max(1, (8 // itemsize_log + 1) - imgh // 32) if pitchalign == 6 else (2 ** (pitchalign - itemsize_log - 2))

      granularity = 128 if options.image.itemsize == 4 else 256
      pitch_add = (1 << pitchalign) if min(next_power2(imgw), round_up(imgw, granularity)) - align_up + 1 <= imgw and imgw > granularity//2 else 0
      pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add
      size = pitch * imgh

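    # Illustrative pitch math for the image branch above (not executed): for a 16x16 float image
    # (itemsize=4, i.e. 4*4=16 bytes per 4-channel pixel), itemsize_log=2, pitchalign=max(6, 11-4)=7,
    # align_up=2**(7-2-2)=8, granularity=128, pitch_add=0 (imgw=16 is not > 64), real_stride=16*4*4=256,
    # pitch=round_up(256, 128)=256, so the backing allocation becomes size = 256*16 = 4096 bytes.
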
    buf = HCQBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)

    if options.image is not None:
      tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
      desc = [qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt), qreg.a6xx_tex_const_1(width=imgw, height=imgh),
              qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6), 0,
              *data64_le(buf.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]

      buf.texture_info = QCOMTextureInfo(pitch, real_stride, desc, [desc[0] & (~0xffff), *desc[1:len(desc)]])
    return buf

  def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, dest_off=0, src_off=0):
    while src_off < src_size:
      ctypes.memmove(dest_addr+dest_off, src_addr+src_off, real_size)
      src_off, dest_off = src_off+src_stride, dest_off+dest_stride

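  # Note: _do_copy is a simple strided row copy. For linear buffers both strides equal the full size, so
  # it degenerates to a single memmove; for images, _copyin writes real_stride bytes of payload per row
  # into pitch-aligned rows, and _copyout reads them back out the same way.
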
  def _copyin(self, dest:HCQBuffer, src:memoryview):
    stride, pitch = (src.nbytes, src.nbytes) if (ti:=cast(QCOMTextureInfo, dest.texture_info)) is None else (ti.real_stride, ti.pitch)
    self._do_copy(mv_address(src), dest.va_addr, src.nbytes, stride, stride, pitch)

  def _copyout(self, dest:memoryview, src:HCQBuffer):
    self.dev.synchronize()

    stride, pitch = (src.size, src.size) if (ti:=cast(QCOMTextureInfo, src.texture_info)) is None else (ti.real_stride, ti.pitch)
    self._do_copy(src.va_addr, mv_address(dest), src.size, stride, pitch, stride)

  def _as_buffer(self, src:HCQBuffer) -> memoryview:
    self.dev.synchronize()
    return to_mv(cast(int, src.va_addr), src.size)

  def _free(self, opaque, options:BufferSpec):
    self.dev.synchronize()
    self.dev._gpu_free(opaque)

class QCOMDevice(HCQCompiled):
  devices: ClassVar[list[HCQCompiled]] = []
  signal_pages: ClassVar[list[Any]] = []
  signal_pool: ClassVar[list[int]] = []

  gpu_id: int = 0
  dummy_addr: int = 0

  def __init__(self, device:str=""):
    self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR)
    QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)

    flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
            | kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
    self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id

    self.cmd_buf = self._gpu_alloc(16 << 20)
    self.cmd_buf_allocator = BumpAllocator(size=self.cmd_buf.size, base=cast(int, self.cmd_buf.va_addr), wrap=True)

    self.border_color_buf = self._gpu_alloc(0x1000, fill_zeroes=True)

    self.last_cmd:int = 0

    # Set max power.
    struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, self.ctx, mv_address(_:=memoryview(array.array('I', [1]))), 4)
    kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)

    # Load info about the qcom device.
    info = kgsl.struct_kgsl_devinfo()
    kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
    QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
    if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")

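    # Illustrative example (an assumption about the chip_id layout, not taken from this file): an
    # Adreno 660 reports core/major/minor bytes of 6, 6, 0, so gpu_id = 6*100 + 6*10 + 0 = 660; anything
    # >= 700 is rejected above since signal() only emits the pre-A7xx CP_EVENT_WRITE packet.
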
    super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
                     QCOMSignal, QCOMComputeQueue, None)

  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
    flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
    if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)

    alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=(bosz:=round_up(size, 1<<alignment_hint)), flags=flags, mmapsize=bosz)
    va_addr = self.fd.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, alloc.id * 0x1000)

    if fill_zeroes: ctypes.memset(va_addr, 0, size)
    return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)

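  # Note (assumption about KGSL semantics): with KGSL_MEMFLAGS_USE_CPU_MAP the GPU address is expected to
  # match the CPU mapping, so the va_addr obtained by mmap'ing the object (at file offset alloc.id * 0x1000)
  # is used directly in command streams; allocations are rounded up to the 4KiB (1 << 12) alignment hint.
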
  def _gpu_free(self, mem:HCQBuffer):
    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
    HWInterface.munmap(mem.va_addr, mem.meta.mmapsize)

  def _ensure_stack_size(self, sz):
    if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
    elif self._stack.size < sz:
      self.synchronize()
      self._gpu_free(self._stack)
      self._stack = self._gpu_alloc(sz)