from __future__ import annotations
from typing import Any, cast, ClassVar
import os, ctypes, ctypes.util, struct, hashlib, functools, importlib, mmap, errno, array, contextlib, sys, select
assert sys.platform != 'win32'
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
from tinygrad.ops import sint
from tinygrad.device import Compiled, ProfileEvent, BufferSpec, CPUProgram, PROFILE
from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, all_same, flatten, DEBUG, OSX
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, libc, pci, vfio, sqtt
from tinygrad.runtime.autogen.am import am
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
from tinygrad.runtime.support.amd import AMDRegBase, collect_registers, import_module
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401 # pylint: disable=unused-import
EVENT_INDEX_PARTIAL_FLUSH = 4  # based on a comment in nvd.h
WAIT_REG_MEM_FUNCTION_EQ = 3   # ==
WAIT_REG_MEM_FUNCTION_NEQ = 4  # !=
WAIT_REG_MEM_FUNCTION_GEQ = 5  # >=
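# These select the compare op in the FUNCTION field shared by PM4 WAIT_REG_MEM and SDMA POLL_REGMEM packets below.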
class AMDSignal(HCQSignal):
  def __init__(self, base_addr:int|None=None, **kwargs):
    super().__init__(base_addr, **kwargs, timestamp_divider=100, dev_t=AMDDevice)

  def _sleep(self, time_spent_waiting_ms:int):
    # Reasonable to sleep only for long workloads (which take more than 2s) and only for timeline signals.
    if time_spent_waiting_ms > 2000 and self.timeline_for_device is not None: self.timeline_for_device.dev_iface.sleep(200)
class AMDComputeQueue(HWQueue):
  def __init__(self, dev:AMDDevice):
    self.dev, self.soc, self.pm4, self.gc, self.nbio = dev, dev.soc, dev.pm4, dev.gc, dev.nbio
    super().__init__()

  def __del__(self):
    if self.binded_device is not None:
      self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True, uncached=True))

  def pkt3(self, cmd, *vals): self.q(self.pm4.PACKET3(cmd, len(vals) - 1), *vals)

  def wreg(self, reg:AMDReg, *args:sint, **kwargs:int):
    if bool(args) == bool(kwargs): raise RuntimeError('One (and only one) of *args or **kwargs must be specified')
    if self.pm4.PACKET3_SET_SH_REG_START <= reg.addr < self.pm4.PACKET3_SET_SH_REG_END:
      set_packet, set_packet_start = self.pm4.PACKET3_SET_SH_REG, self.pm4.PACKET3_SET_SH_REG_START
    elif self.pm4.PACKET3_SET_UCONFIG_REG_START <= reg.addr < self.pm4.PACKET3_SET_UCONFIG_REG_START + 2**16 - 1:
      set_packet, set_packet_start = self.pm4.PACKET3_SET_UCONFIG_REG, self.pm4.PACKET3_SET_UCONFIG_REG_START
    else: raise RuntimeError(f'Cannot set {reg.name} ({reg.addr}) via pm4 packet')
    self.pkt3(set_packet, reg.addr - set_packet_start, *(args or (reg.encode(**kwargs),)))
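  # PRED_EXEC gates the packets that follow it to the XCCs selected in xcc_mask. The dword count of the
  # predicated region is only known after the body has been queued, so it is patched into the packet on exit.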
  @contextlib.contextmanager
  def pred_exec(self, xcc_mask:int):
    if self.dev.xccs > 1:
      self.pkt3(self.pm4.PACKET3_PRED_EXEC, xcc_mask << 24)
      prev_len = len(self._q)
    yield
    if self.dev.xccs > 1:
      self._q[prev_len - 1] |= (len(self._q) - prev_len)
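  # RGP markers are streamed out by writing the marker struct, two dwords at a time, to consecutive
  # SQ_THREAD_TRACE_USERDATA registers starting at USERDATA_2.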
  def sqtt_userdata(self, data, *extra_dwords):
    data_ints = [x[0] for x in struct.iter_unpack('<I', bytes(data))] + list(extra_dwords)
    for i in range(0, len(data_ints), 2):
      self.wreg(self.gc.regSQ_THREAD_TRACE_USERDATA_2, *data_ints[i:i+2])

  def wait_reg_mem(self, value, mask=0xffffffff, mem=None, reg_req=None, reg_done=None):
    wrm_info_dw = self.pm4.WAIT_REG_MEM_MEM_SPACE(int(mem is not None)) | self.pm4.WAIT_REG_MEM_OPERATION(int(mem is None)) \
                | self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_GEQ) | self.pm4.WAIT_REG_MEM_ENGINE(0)
    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, wrm_info_dw, *(data64_le(mem) if mem is not None else (reg_req, reg_done)), value, mask, 4)
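  # ACQUIRE_MEM invalidates (and where applicable writes back) the selected caches over [addr, addr+sz) before
  # later packets execute; the gl* flags map to the GCR cache levels on gfx10+, CP_COHER_CNTL on older parts.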
  def acquire_mem(self, addr=0x0, sz=(1 << 64) - 1, gli=1, glm=1, glk=1, glv=1, gl1=1, gl2=1):
    if self.dev.gfxver >= 10:
      cache_flags_dw = self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) \
                     | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_INV(glm) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLM_WB(glm) \
                     | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_WB(glk) \
                     | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) \
                     | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2) | self.pm4.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_WB(gl2)
      self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, 0, *data64_le(sz), *data64_le(addr), 0, cache_flags_dw)
    else:
      cp_coher_cntl = self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_ICACHE_ACTION_ENA(gli) | \
                      self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_SH_KCACHE_ACTION_ENA(glk) | \
                      self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_ACTION_ENA(1) | \
                      self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TCL1_ACTION_ENA(1) | \
                      self.pm4.PACKET3_ACQUIRE_MEM_CP_COHER_CNTL_TC_WB_ACTION_ENA(1)
      self.pkt3(self.pm4.PACKET3_ACQUIRE_MEM, cp_coher_cntl, *data64_le(sz), *data64_le(addr), 0x0000000A)
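  # RELEASE_MEM is an end-of-pipe event: once all prior work has drained (and caches are flushed if requested),
  # the CP writes `value` (or a GPU clock counter, depending on data_sel) to `address` and can raise an interrupt.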
  def release_mem(self, address=0x0, value=0, data_sel=0, int_sel=2, ctxid=0, cache_flush=False):
    if self.dev.gfxver >= 10:
      cache_flags_dw = 0 if not cache_flush else (self.pm4.PACKET3_RELEASE_MEM_GCR_GLV_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL1_INV \
                     | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_WB \
                     | self.pm4.PACKET3_RELEASE_MEM_GCR_GLM_INV | self.pm4.PACKET3_RELEASE_MEM_GCR_GL2_WB | self.pm4.PACKET3_RELEASE_MEM_GCR_SEQ)
      event_dw = self.pm4.PACKET3_RELEASE_MEM_EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) \
               | self.pm4.PACKET3_RELEASE_MEM_EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
      memsel_dw = self.pm4.PACKET3_RELEASE_MEM_DATA_SEL(data_sel) | self.pm4.PACKET3_RELEASE_MEM_INT_SEL(int_sel) \
                | self.pm4.PACKET3_RELEASE_MEM_DST_SEL(0)
    else:
      cache_flags_dw = 0 if not cache_flush else (self.pm4.EOP_TC_WB_ACTION_EN | self.pm4.EOP_TC_NC_ACTION_EN)
      event_dw = self.pm4.EVENT_TYPE(self.pm4.CACHE_FLUSH_AND_INV_TS_EVENT) | self.pm4.EVENT_INDEX(self.pm4.event_index__mec_release_mem__end_of_pipe)
      memsel_dw = self.pm4.DATA_SEL(data_sel) | self.pm4.INT_SEL(int_sel)
      ctxid = 0
    self.pkt3(self.pm4.PACKET3_RELEASE_MEM, event_dw | cache_flags_dw, memsel_dw, *data64_le(address), *data64_le(value), ctxid)
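  # Rendezvous barrier across the 8 XCCs: every XCC atomically increments the counter, then spins until it wraps
  # to 0 mod 8, i.e. until all XCCs have arrived. Two counters (a, b) back to back prevent a fast XCC from racing
  # into the next barrier and re-incrementing before a slow one has observed the release.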
  def xcc_barrier(self):
    if self.dev.xcc_sync is None: return self
    assert self.dev.xccs == 8, 'only 8 XCCs supported'
    a, b = self.dev.xcc_sync
    mem_eq = self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ) | self.pm4.WAIT_REG_MEM_MEM_SPACE(1)
    self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(a.value_addr), *data64_le(1), *data64_le(0), 10)  # a += 1
    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(a.value_addr), 0, 0b111, 10)  # a == 0 (mod 8) via bitmask
    self.pkt3(self.pm4.PACKET3_ATOMIC_MEM, self.soc.TC_OP_ATOMIC_ADD_RTN_32, *data64_le(b.value_addr), *data64_le(1), *data64_le(0), 10)  # b += 1
    self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, mem_eq, *data64_le(b.value_addr), 0, 0b111, 10)  # b == 0 (mod 8) via bitmask
    return self
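  # Full memory barrier: handshake an HDP flush through the NBIO request/done registers so host writes reach
  # device memory, then invalidate all GPU caches with acquire_mem.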
  def memory_barrier(self):
    pf = 0 if self.nbio.version[:2] != (7, 11) else 1
    self.wait_reg_mem(reg_req=getattr(self.nbio, f'regBIF_BX_PF{pf}_GPU_HDP_FLUSH_REQ').addr,
                      reg_done=getattr(self.nbio, f'regBIF_BX_PF{pf}_GPU_HDP_FLUSH_DONE').addr, value=0xffffffff)
    self.acquire_mem()
    return self
  def xcc_config(self):
    self.wreg(self.gc.regCOMPUTE_TG_CHUNK_SIZE, 1)
    for xcc_id in range(self.dev.xccs):
      with self.pred_exec(xcc_mask=1 << xcc_id):
        self.wreg(self.gc.regCOMPUTE_CURRENT_LOGIC_XCC_ID, xcc_id)
    return self

  def spi_config(self, tracing:bool):
    self.wreg(self.gc.regSPI_CONFIG_CNTL, ps_pkr_priority_cntl=3, exp_priority_order=3, gpr_write_priority=0x2c688,
              enable_sqg_bop_events=int(tracing), enable_sqg_top_events=int(tracing))

  def sqtt_config(self, tracing:bool):
    self.wreg(self.gc.regSQ_THREAD_TRACE_CTRL, draw_event_en=1, spi_stall_en=1, sq_stall_en=1, reg_at_hwm=2, hiwater=1,
              rt_freq=self.soc.SQ_TT_RT_FREQ_4096_CLK, util_timer=self.soc.SQ_TT_UTIL_TIMER_250_CLK, mode=int(tracing))
  # Magic values from mesa/src/amd/vulkan/radv_sqtt.c:radv_emit_spi_config_cntl and src/amd/common/ac_sqtt.c:ac_sqtt_emit_start
  def start_trace(self, buf0s:list[HCQBuffer], se_mask:int):
    self.memory_barrier()
    self.spi_config(tracing=True)
    # One buffer per SE. Mesa does it with a single buffer and ac_sqtt_get_data_offset, but this is simpler and should work just as well.
    for se in range(len(buf0s)):
      self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
      buf0_lo, buf0_hi = data64_le(buf0s[se].va_addr >> 12)
      self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_SIZE, base_hi=buf0_hi, size=buf0s[se].size >> 12)
      self.wreg(self.gc.regSQ_THREAD_TRACE_BUF0_BASE, base_lo=buf0_lo)
      # NOTE: SQTT can only trace instructions on one simd per se, this selects the first simd in the first wgp in the first sa.
      # For RGP to display an instruction trace it has to see it on the first SE. However ACE/MEC/whatever does the dispatching starting with the
      # second se, and on amdgpu/non-AM it also does weird things with dispatch order inside the se: around 7 times out of 10 it starts from the
      # last cu, but sometimes not, especially if the kernel has more than one wavefront. This means kernels with a small global size might get
      # unlucky, be dispatched somewhere else, and not show up in the instruction tracing tab. You can force the wavefronts of a kernel onto the
      # CUs you want by disabling other CUs via bits in regCOMPUTE_STATIC_THREAD_MGMT_SE<x> and trace even kernels that only have one wavefront.
      self.wreg(self.gc.regSQ_THREAD_TRACE_MASK, wtype_include=self.soc.SQ_TT_WTYPE_INCLUDE_CS_BIT, simd_sel=0, wgp_sel=0, sa_sel=0)
      REG_INCLUDE = self.soc.SQ_TT_TOKEN_MASK_SQDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_SHDEC_BIT | self.soc.SQ_TT_TOKEN_MASK_GFXUDEC_BIT | \
                    self.soc.SQ_TT_TOKEN_MASK_COMP_BIT | self.soc.SQ_TT_TOKEN_MASK_CONTEXT_BIT
      TOKEN_EXCLUDE = 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_PERF_SHIFT
      if not (se_mask >> se) & 0b1:
        TOKEN_EXCLUDE |= 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VMEMEXEC_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_ALUEXEC_SHIFT | \
                         1 << self.soc.SQ_TT_TOKEN_EXCLUDE_VALUINST_SHIFT | 1 << self.soc.SQ_TT_TOKEN_EXCLUDE_IMMEDIATE_SHIFT | \
                         1 << self.soc.SQ_TT_TOKEN_EXCLUDE_INST_SHIFT
      self.wreg(self.gc.regSQ_THREAD_TRACE_TOKEN_MASK, reg_include=REG_INCLUDE, token_exclude=TOKEN_EXCLUDE, bop_events_token_include=1)
      # Enable SQTT
      self.sqtt_config(tracing=True)
    # Restore global broadcasting
    self.wreg(self.gc.regGRBM_GFX_INDEX, se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
    self.wreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE, 1)
    self.memory_barrier()
    return self
  # Magic values from src/amd/common/ac_sqtt.c:ac_sqtt_emit_stop and src/amd/common/ac_sqtt.c:ac_sqtt_emit_wait
  def stop_trace(self, ses:int, wptrs:HCQBuffer):
    self.memory_barrier()
    # Start shutting everything down
    self.wreg(self.gc.regCOMPUTE_THREAD_TRACE_ENABLE, 0)
    self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_FINISH) | self.pm4.EVENT_INDEX(0))
    # For each SE wait for finish to complete, then copy regSQ_THREAD_TRACE_WPTR out to know where in the buffer the trace data ends
    for se in range(ses):
      self.wreg(self.gc.regGRBM_GFX_INDEX, se_index=se, instance_broadcast_writes=1)
      # Wait for FINISH_PENDING == 0
      self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
                self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__FINISH_PENDING_MASK, 4)
      # Wait for FINISH_DONE != 0
      self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_NEQ),
                self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__FINISH_DONE_MASK, 4)
      # Disable SQTT
      self.sqtt_config(tracing=False)
      # Wait for BUSY == 0
      self.pkt3(self.pm4.PACKET3_WAIT_REG_MEM, self.pm4.WAIT_REG_MEM_FUNCTION(WAIT_REG_MEM_FUNCTION_EQ),
                self.gc.regSQ_THREAD_TRACE_STATUS.addr, 0, 0, self.gc.SQ_THREAD_TRACE_STATUS__BUSY_MASK, 4)
      # Copy WPTR to memory (src_sel = perf, dst_sel = tc_l2, wr_confirm = True)
      self.pkt3(self.pm4.PACKET3_COPY_DATA, 1 << 20 | 2 << 8 | 4, self.gc.regSQ_THREAD_TRACE_WPTR.addr, 0, *data64_le(wptrs.va_addr + (se * 4)))
    # Restore global broadcasting
    self.wreg(self.gc.regGRBM_GFX_INDEX, se_broadcast_writes=1, sa_broadcast_writes=1, instance_broadcast_writes=1)
    self.spi_config(tracing=False)
    self.memory_barrier()
    return self
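  # Dispatch sequence: invalidate stale caches, program the shader address and rsrc registers, set up the user
  # SGPRs (scratch, dispatch packet pointer, kernarg pointer), then fire PACKET3_DISPATCH_DIRECT.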
  def exec(self, prg:AMDProgram, args_state:CLikeArgsState, global_size:tuple[sint, ...], local_size:tuple[sint, ...]):
    self.bind_args_state(args_state)

    self.acquire_mem(gli=0, gl2=0)

    if prg.enable_private_segment_sgpr:
      assert self.dev.xccs == 1, "Only architected flat scratch is supported on multi-xcc"
      scratch_hilo = data64_le(prg.dev.scratch.va_addr)
      # sgpr word1 bit31 enables swizzle
      # sgpr word3 = 0x14 << 12 | 2 << 28 | 2 << 21 | 1 << 23
      user_regs = [scratch_hilo[0], scratch_hilo[1] | 1 << 31, 0xffffffff, 0x20c14000]
    else: user_regs = []

    if prg.enable_dispatch_ptr:
      dp = hsa.hsa_kernel_dispatch_packet_t.from_address(dp_addr:=args_state.ptr + prg.kernargs_segment_size)

      self.bind_sints(*local_size, struct=dp, start_field='workgroup_size_x', fmt='H')
      self.bind_sints(*[g * l for g, l in zip(global_size, local_size)], struct=dp, start_field='grid_size_x', fmt='I')
      dp.group_segment_size, dp.private_segment_size, dp.kernarg_address = prg.group_segment_size, prg.private_segment_size, args_state.ptr
      user_regs += [*data64_le(dp_addr)]

    user_regs += [*data64_le(args_state.ptr)]

    if prg.dev.sqtt_enabled:
      self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind(
        _0=sqtt.union_rgp_sqtt_marker_pipeline_bind_0(_0=sqtt.struct_rgp_sqtt_marker_pipeline_bind_0_0(
          identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE,
          bind_point=1,  # compute
        )),
        _1=sqtt.union_rgp_sqtt_marker_pipeline_bind_1(api_pso_hash=data64_le(prg.libhash[0])),
      ))
      self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event(
        _0=sqtt.union_rgp_sqtt_marker_event_0(_0=sqtt.struct_rgp_sqtt_marker_event_0_0(has_thread_dims=1)),
        _2=sqtt.union_rgp_sqtt_marker_event_2(cmd_id=prg.dev.cmd_id),
      ), *global_size)
      prg.dev.cmd_id += 1

    self.wreg(self.gc.regCOMPUTE_PGM_LO, *data64_le(prg.prog_addr >> 8))
    self.wreg(self.gc.regCOMPUTE_PGM_RSRC1, prg.rsrc1, prg.rsrc2)
    self.wreg(self.gc.regCOMPUTE_PGM_RSRC3, prg.rsrc3)
    self.wreg(self.gc.regCOMPUTE_TMPRING_SIZE, prg.dev.tmpring_size)
    if prg.dev.has_scratch_base_registers:
      for xcc_id in range(self.dev.xccs):
        with self.pred_exec(xcc_mask=1 << xcc_id):
          scratch_base = prg.dev.scratch.va_addr + (prg.dev.scratch.size // self.dev.xccs * xcc_id)
          self.wreg(self.gc.regCOMPUTE_DISPATCH_SCRATCH_BASE_LO, *data64_le(scratch_base >> 8))
    if 100000 <= prg.dev.target < 110000: self.wreg(self.gc.mmCP_COHER_START_DELAY, 0x20)
    self.wreg(self.gc.regCOMPUTE_RESTART_X, 0, 0, 0)
    self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE0, 0xFFFFFFFF, 0xFFFFFFFF)
    self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE2, 0xFFFFFFFF, 0xFFFFFFFF)
    if prg.dev.target >= 100000:
      self.wreg(self.gc.regCOMPUTE_STATIC_THREAD_MGMT_SE4, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)
    self.wreg(self.gc.regCOMPUTE_USER_DATA_0, *user_regs)
    self.wreg(self.gc.regCOMPUTE_START_X, 0, 0, 0, *local_size, 0, 0)
    self.wreg(self.gc.regCOMPUTE_RESOURCE_LIMITS, 0)

    gfx10p = {'cs_w32_en': int(prg.wave32)} if prg.dev.target >= 100000 else {}
    DISPATCH_INITIATOR = self.gc.regCOMPUTE_DISPATCH_INITIATOR.encode(**gfx10p, force_start_at_000=1, compute_shader_en=1)
    self.pkt3(self.pm4.PACKET3_DISPATCH_DIRECT, *global_size, DISPATCH_INITIATOR)
    if prg.dev.sqtt_enabled: self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.THREAD_TRACE_MARKER) | self.pm4.EVENT_INDEX(0))
    self.pkt3(self.pm4.PACKET3_EVENT_WRITE, self.pm4.EVENT_TYPE(self.soc.CS_PARTIAL_FLUSH) | self.pm4.EVENT_INDEX(EVENT_INDEX_PARTIAL_FLUSH))
    if self.dev.xccs > 1: self.release_mem(cache_flush=True)
    self.xcc_barrier()
    return self
  def wait(self, signal:AMDSignal, value:sint=0):
    self.wait_reg_mem(mem=signal.value_addr, value=value, mask=0xffffffff)
    self.xcc_barrier()
    return self

  def timestamp(self, signal:AMDSignal):
    with self.pred_exec(xcc_mask=0b1):
      self.release_mem(signal.timestamp_addr, 0, self.pm4.data_sel__mec_release_mem__send_gpu_clock_counter, self.pm4.int_sel__mec_release_mem__none)
    return self

  def signal(self, signal:AMDSignal, value:sint=0):
    with self.pred_exec(xcc_mask=0b1):
      # NOTE: this needs an EOP buffer on the queue or it will NULL pointer
      self.release_mem(signal.value_addr, value, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                       self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, cache_flush=True)
      if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
        self.release_mem(dev.queue_event_mailbox_ptr, dev.queue_event.event_id, self.pm4.data_sel__mec_release_mem__send_32_bit_low,
                         self.pm4.int_sel__mec_release_mem__send_interrupt_after_write_confirm, ctxid=dev.queue_event.event_id)
    return self
  def bind(self, dev:AMDDevice):
    self.binded_device = dev
    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i, value in enumerate(self._q): hw_view[i] = value

    self.indirect_cmd = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(self.hw_page.va_addr),
                         len(self._q) | self.pm4.INDIRECT_BUFFER_VALID]
    self._q = hw_view
    return self
  def _submit(self, dev:AMDDevice):
    cmds = self.indirect_cmd if dev == self.binded_device else self._q

    # WORKAROUND: PACKET3_PRED_EXEC doesn't work in rings, only in IBs, so create a fake IB inside the ring to work around that
    if self.dev.xccs > 1 and dev != self.binded_device:
      ib_end = ((dev.compute_queue.put_value + 5) % len(dev.compute_queue.ring)) + len(cmds)
      ib_pad = len(dev.compute_queue.ring) - (ib_end - len(cmds)) if ib_end > len(dev.compute_queue.ring) else 0
      ib_ptr = mv_address(dev.compute_queue.ring) + ((dev.compute_queue.put_value + 5 + ib_pad) % len(dev.compute_queue.ring)) * 4
      cmds = [self.pm4.PACKET3(self.pm4.PACKET3_INDIRECT_BUFFER, 2), *data64_le(ib_ptr), len(cmds) | self.pm4.INDIRECT_BUFFER_VALID,
              self.pm4.PACKET3(self.pm4.PACKET3_NOP, ib_pad + len(cmds) - 1), *((0,) * ib_pad), *cmds]

    for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
    dev.compute_queue.put_value += len(cmds)
    dev.compute_queue.signal_doorbell(dev)
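# The SDMA engine consumes a stream of variable-length packets rather than fixed-size PM4 dwords, so the copy
# queue tracks the size of every queued command (internal_cmd_sizes) to be able to split the stream cleanly at
# the ring-buffer wrap point in _submit.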
class AMDCopyQueue(HWQueue):
  def __init__(self, dev, max_copy_size=0x40000000):
    self.dev, self.sdma, self.internal_cmd_sizes, self.max_copy_size = dev, dev.sdma, [], max_copy_size
    super().__init__()

  def q(self, *arr):
    super().q(*arr)
    self.internal_cmd_sizes.append(len(arr))

  def copy(self, dest:sint, src:sint, copy_size:int):
    copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
    for _ in range(copy_commands):
      step_copy_size = min(copy_size - copied, self.max_copy_size)
      self.q(self.sdma.SDMA_OP_COPY | self.sdma.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_COPY_LINEAR),
             self.sdma.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
      copied += step_copy_size
    return self
  def signal(self, signal:AMDSignal, value:sint=0):
    fence_flags = self.sdma.SDMA_PKT_FENCE_HEADER_MTYPE(3) if self.dev.gfxver >= 10 else 0
    self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(signal.value_addr), value)

    if not AMDDevice.driverless and (dev:=signal.timeline_for_device) is not None:
      self.q(self.sdma.SDMA_OP_FENCE | fence_flags, *data64_le(dev.queue_event_mailbox_ptr), dev.queue_event.event_id)
      self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(dev.queue_event.event_id))
    elif AMDDevice.driverless: self.q(self.sdma.SDMA_OP_TRAP, self.sdma.SDMA_PKT_TRAP_INT_CONTEXT_INT_CONTEXT(0))
    return self
  def wait(self, signal:AMDSignal, value:sint=0):
    self.q(self.sdma.SDMA_OP_POLL_REGMEM | self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_FUNC(WAIT_REG_MEM_FUNCTION_GEQ) | \
           self.sdma.SDMA_PKT_POLL_REGMEM_HEADER_MEM_POLL(1), *data64_le(signal.value_addr), value, 0xffffffff,
           self.sdma.SDMA_PKT_POLL_REGMEM_DW5_INTERVAL(0x04) | self.sdma.SDMA_PKT_POLL_REGMEM_DW5_RETRY_COUNT(0xfff))
    return self

  def timestamp(self, signal:AMDSignal):
    self.q(self.sdma.SDMA_OP_TIMESTAMP | self.sdma.SDMA_PKT_TIMESTAMP_GET_HEADER_SUB_OP(self.sdma.SDMA_SUBOP_TIMESTAMP_GET_GLOBAL),
           *data64_le(signal.timestamp_addr))
    return self
  def bind(self, dev:AMDDevice):
    if not getenv("AMD_SDMA_BIND", 0) or not dev.driverless: return
    self.binded_device = dev
    self.hw_page = dev.allocator.alloc((qsz:=round_up(len(self._q), 8)) * 4, BufferSpec(cpu_access=True, nolru=True, uncached=True))
    hw_view = to_mv(self.hw_page.va_addr, self.hw_page.size).cast("I")
    for i in range(qsz): hw_view[i] = self._q[i] if i < len(self._q) else 0

    self.indirect_cmd = [self.sdma.SDMA_OP_INDIRECT | self.sdma.SDMA_PKT_INDIRECT_HEADER_VMID(0), *data64_le(self.hw_page.va_addr), qsz,
                         *data64_le(0)]
    self._q, self.cmd_sizes = hw_view, [len(self.indirect_cmd)]
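  # Submitting to the SDMA ring: blit whole packets until one would cross the end of the ring, zero-fill the
  # remainder (a zero dword is an SDMA NOP), then wrap to the start. Packets must never straddle the wrap point.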
  def _submit(self, dev:AMDDevice):
    if dev.sdma_queue.put_value - dev.sdma_queue.read_ptr > dev.sdma_queue.ring.nbytes: raise RuntimeError("SDMA queue overrun")

    if self.binded_device == dev:
      # An IB packet must end on a 8 DW boundary.
      add = (8 - (((dev.sdma_queue.put_value % 32) // 4) + len(self.indirect_cmd) % 8)) % 8
      cmds, cmd_sizes = ([0] * add) + self.indirect_cmd, [len(self.indirect_cmd) + add]

      if len(cmds) * 4 >= (dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes):
        cmds, cmd_sizes = [0, 0] + self.indirect_cmd, [8]
    else: cmds, cmd_sizes = self._q, self.internal_cmd_sizes

    tail_blit_dword = 0
    for cmdsz in cmd_sizes:
      if (tail_blit_dword + cmdsz) * 4 >= dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes: break
      tail_blit_dword += cmdsz

    start_idx = (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes) // 4
    dev.sdma_queue.ring[start_idx : start_idx + tail_blit_dword] = array.array('I', cmds[:tail_blit_dword])
    dev.sdma_queue.put_value += tail_blit_dword * 4

    if (rem_packet_cnt := len(cmds) - tail_blit_dword) > 0:
      zero_fill = dev.sdma_queue.ring.nbytes - dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes
      ctypes.memset(mv_address(dev.sdma_queue.ring) + (dev.sdma_queue.put_value % dev.sdma_queue.ring.nbytes), 0, zero_fill)
      dev.sdma_queue.put_value += zero_fill

      dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
      dev.sdma_queue.put_value += rem_packet_cnt * 4

    dev.sdma_queue.signal_doorbell(dev)
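# An AMDProgram wraps one compiled kernel: the ELF image is loaded into GPU memory and the kernel descriptor in
# .rodata supplies the segment sizes and rsrc1/rsrc2/rsrc3 words that exec() programs into the COMPUTE_PGM_* registers.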
class AMDProgram(HCQProgram):
  def __init__(self, dev:AMDDevice, name:str, lib:bytes):
    # TODO: this API needs the type signature of the function and global_size/local_size
    self.dev: AMDDevice = dev
    self.name, self.lib = name, lib
    image, sections, _ = elf_loader(self.lib)
    self.lib_gpu = self.dev.allocator.alloc(round_up(image.nbytes, 0x1000), BufferSpec(cpu_access=True, nolru=True))
    ctypes.memmove(self.lib_gpu.va_addr, mv_address(image), image.nbytes)

    rodata_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".rodata"), -1)
    text_entry = next((sh.header.sh_addr for sh in sections if sh.name == ".text"), -1)
    assert rodata_entry >= 0 and text_entry >= 0, ".text or .rodata section not found"
    self.group_segment_size = image[rodata_entry:rodata_entry+4].cast("I")[0]
    self.private_segment_size = image[rodata_entry+4:rodata_entry+8].cast("I")[0]
    self.kernargs_segment_size = image[rodata_entry+8:rodata_entry+12].cast("I")[0]
    lds_size = ((self.group_segment_size + 511) // 512) & 0x1FF
    if lds_size > (self.dev.dev_iface.props['lds_size_in_kb'] * 1024) // 512: raise RuntimeError("Too many resources requested: group_segment_size")

    # Ensure scratch size
    self.dev._ensure_has_local_memory(self.private_segment_size)

    code = hsa.amd_kernel_code_t.from_address(self.lib_gpu.va_addr + rodata_entry)  # NOTE: this is wrong, it's not this object
    self.wave32: bool = code.kernel_code_properties & 0x400 == 0x400

    # Set rsrc1.priv=1 on gfx11 to workaround cwsr.
    self.rsrc1: int = code.compute_pgm_rsrc1 | ((1 << 20) if 110000 <= self.dev.target < 120000 else 0)
    self.rsrc2: int = code.compute_pgm_rsrc2 | (lds_size << 15)
    self.rsrc3: int = image[rodata_entry+44:rodata_entry+48].cast("I")[0]  # NOTE: kernel descriptor, not in amd_kernel_code_t struct
    self.prog_addr: int = self.lib_gpu.va_addr + rodata_entry + code.kernel_code_entry_byte_offset
    if code.kernel_code_entry_byte_offset == 0: self.prog_addr = self.lib_gpu.va_addr + text_entry
    # Some programs use hsa_kernel_dispatch_packet_t to read workgroup sizes during execution.
    # The packet is represented as a pointer and set up in SGPRs. Space for the packet is allocated as part of the kernel arguments.
    self.enable_dispatch_ptr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_DISPATCH_PTR
    self.enable_private_segment_sgpr: int = code.kernel_code_properties & hsa.AMD_KERNEL_CODE_PROPERTIES_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER
    additional_alloc_sz = ctypes.sizeof(hsa.hsa_kernel_dispatch_packet_t) if self.enable_dispatch_ptr else 0

    if dev.sqtt_enabled: self.libhash: tuple[int, int] = struct.unpack('<Q', hashlib.md5(self.lib).digest()[:8]) * 2

    super().__init__(CLikeArgsState, self.dev, self.name, kernargs_alloc_size=self.kernargs_segment_size + additional_alloc_sz, lib=self.lib,
                     base=self.lib_gpu.va_addr)

  def __del__(self):
    if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
class AMDAllocator(HCQAllocator['AMDDevice']):
  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
    return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)

  def _free(self, opaque, options:BufferSpec):
    self.dev.synchronize()
    self.dev.dev_iface.free(opaque)

  def map(self, buf:HCQBuffer): self.dev.dev_iface.map(buf._base if buf._base is not None else buf)

MAP_FIXED, MAP_NORESERVE, MAP_LOCKED = 0x10, 0x400, 0 if OSX else 0x2000
@dataclass(frozen=True)
class ProfileSQTTEvent(ProfileEvent): device:str; se:int; blob:bytes; itrace:bool # noqa: E702

@dataclass
class AMDQueueDesc:
  ring: memoryview
  read_ptrs: list[memoryview]
  write_ptrs: list[memoryview]
  doorbells: list[memoryview]
  put_value: int = 0

  @property
  def read_ptr(self): return min(p[0] for p in self.read_ptrs)

  @classmethod
  def multi(cls, *queues: AMDQueueDesc):
    assert all_same([(mv_address(q.ring), q.put_value) for q in queues]), f"All queues must have the same ring and put_value: {queues}"
    return cls(ring=queues[0].ring, put_value=queues[0].put_value, doorbells=flatten(q.doorbells for q in queues),
               read_ptrs=flatten(q.read_ptrs for q in queues), write_ptrs=flatten(q.write_ptrs for q in queues))

  def signal_doorbell(self, dev):
    for write_ptr in self.write_ptrs: write_ptr[0] = self.put_value

    # Ensure all prior writes are visible to the GPU.
    if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)

    # Flush hdp if queue is in dev mem.
    if dev.driverless and getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1): dev.dev_iface.adev.gmc.flush_hdp()
    for doorbell in self.doorbells: doorbell[0] = self.put_value
@dataclass(frozen=True)
class AMDReg(AMDRegBase):
  ip: AMDIP
  @property
  def addr(self): return self.ip.bases[self.segment] + self.offset

@dataclass(frozen=True)
class AMDIP:
  name: str
  version: tuple[int, ...]
  bases: tuple[int, ...]
  @functools.cached_property
  def module(self): return import_module(self.name, self.version)
  @functools.cached_property
  def regs(self): return collect_registers(self.module, cls=functools.partial(AMDReg, ip=self))
  def __getattr__(self, name:str):
    if name in self.regs: return self.regs[name]
    return getattr(self.module, name)
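# Two backends provide the same interface (alloc/free/map/create_queue/sleep/on_device_hang): KFDIface drives the
# GPU through the in-kernel KFD driver's /dev/kfd ioctls, while PCIIface (driverless) programs the PCI device
# directly through sysfs/VFIO and the AM driver.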
class KFDIface:
  kfd: HWInterface | None = None
  event_page: HCQBuffer | None = None
  gpus: list[HWInterface] = []

  def _is_usable_gpu(self, gpu_id):
    with contextlib.suppress(OSError): return int(gpu_id.read()) != 0
    return False

  def __init__(self, dev, device_id):
    self.dev = dev

    kfd_topo_path = "/sys/devices/virtual/kfd/kfd/topology/nodes"

    # Initialize KFD interface during first run
    if KFDIface.kfd is None:
      KFDIface.kfd = HWInterface("/dev/kfd", os.O_RDWR)
      gpus = [g for g in HWInterface(kfd_topo_path).listdir() if self._is_usable_gpu(HWInterface(f"{kfd_topo_path}/{g}/gpu_id"))]
      gpus = sorted(gpus, key=lambda x: int(x.split('/')[-1]))
      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
      KFDIface.gpus = [gpus[x] for x in visible_devices] if visible_devices else gpus

    if device_id >= len(KFDIface.gpus): raise RuntimeError(f"No device found for {device_id}. Requesting more devices than the system has?")

    self.gpu_id = int(HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/gpu_id").read())
    self.props = {l.split()[0]: int(l.split()[1]) for l in HWInterface(f"{kfd_topo_path}/{KFDIface.gpus[device_id]}/properties").read().splitlines()}
    ip_base = f"/sys/class/drm/renderD{self.props['drm_render_minor']}/device/ip_discovery/die/0"
    id2ip = {am.GC_HWID: am.GC_HWIP, am.SDMA0_HWID: am.SDMA0_HWIP, am.NBIF_HWID: am.NBIF_HWIP}
    self.ip_versions = {id2ip[int(hwid)]: tuple(int(HWInterface(f'{ip_base}/{hwid}/0/{part}').read()) for part in ['major', 'minor', 'revision'])
                        for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
    self.ip_offsets = {id2ip[int(hwid)]: tuple(int(x, 16) for x in HWInterface(f'{ip_base}/{hwid}/0/base_addr').read().splitlines())
                       for hwid in HWInterface(ip_base).listdir() if hwid.isnumeric() and int(hwid) in id2ip}
    self.drm_fd = HWInterface(f"/dev/dri/renderD{self.props['drm_render_minor']}", os.O_RDWR)

    kfd.AMDKFD_IOC_ACQUIRE_VM(KFDIface.kfd, drm_fd=self.drm_fd.fd, gpu_id=self.gpu_id)

    # Set these for our device.
    if KFDIface.event_page is None:
      KFDIface.event_page = self.alloc(0x8000, uncached=True)
      kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_page_offset=KFDIface.event_page.meta.handle)
    else: self.map(KFDIface.event_page)

    # Event to wait for queues completion
    self.dev.queue_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_SIGNAL, auto_reset=1)
    self.dev.queue_event_mailbox_ptr = KFDIface.event_page.va_addr + self.dev.queue_event.event_slot_index * 8
    self.queue_event_arr = (kfd.struct_kfd_event_data)(event_id=self.dev.queue_event.event_id)
    self.queue_event_arr_ptr = ctypes.addressof(self.queue_event_arr)

    # OS events to collect memory and hardware faults
    self.mem_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_MEMORY)
    self.hw_fault_event = kfd.AMDKFD_IOC_CREATE_EVENT(KFDIface.kfd, event_type=kfd.KFD_IOC_EVENT_HW_EXCEPTION)
  def alloc(self, size:int, host=False, uncached=False, cpu_access=False) -> HCQBuffer:
    flags = kfd.KFD_IOC_ALLOC_MEM_FLAGS_WRITABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_EXECUTABLE | kfd.KFD_IOC_ALLOC_MEM_FLAGS_NO_SUBSTITUTE
    if uncached: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_COHERENT | kfd.KFD_IOC_ALLOC_MEM_FLAGS_UNCACHED | kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT
    else: flags |= (kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR if host else kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)

    if cpu_access or host: flags |= kfd.KFD_IOC_ALLOC_MEM_FLAGS_PUBLIC

    if flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR:
      buf = addr = HWInterface.anon_mmap(0, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS, 0)
    else: buf, addr = 0, HWInterface.anon_mmap(0, size, 0, mmap.MAP_PRIVATE | mmap.MAP_ANONYMOUS | MAP_NORESERVE, 0)
    assert addr != 0xffffffffffffffff

    try: mem = kfd.AMDKFD_IOC_ALLOC_MEMORY_OF_GPU(self.kfd, va_addr=addr, size=size, base=addr, length=size, gpu_id=self.gpu_id,
                                                  flags=flags, mmap_offset=buf)
    except OSError as e:
      if e.errno == errno.EINVAL and (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM) and cpu_access:
        raise MemoryError("Cannot allocate host-visible VRAM. Ensure the resizable BAR option is enabled on your system.") from e
      if e.errno == errno.ENOMEM: raise MemoryError("Cannot allocate memory: no memory is available.") from e
      raise

    if not (flags & kfd.KFD_IOC_ALLOC_MEM_FLAGS_USERPTR):
      buf = self.drm_fd.mmap(mem.va_addr, mem.size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | MAP_FIXED, mem.mmap_offset)
      assert addr == buf == mem.va_addr

    self.map(hcqbuf:=HCQBuffer(mem.va_addr, mem.size, meta=mem))
    return hcqbuf
  def free(self, mem):
    if len(gpus:=getattr(mem.meta, "mapped_gpu_ids", [])):
      c_gpus = (ctypes.c_int32 * len(gpus))(*gpus)
      stm = kfd.AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus), n_devices=len(gpus))
      assert stm.n_success == len(gpus)
    if mem.va_addr: HWInterface.munmap(mem.va_addr, mem.size)
    kfd.AMDKFD_IOC_FREE_MEMORY_OF_GPU(self.kfd, handle=mem.meta.handle)

  def map(self, mem):
    if self.gpu_id in getattr(mem.meta, "mapped_gpu_ids", []): return
    mem.meta.__setattr__("mapped_gpu_ids", getattr(mem.meta, "mapped_gpu_ids", []) + [self.gpu_id])
    c_gpus = (ctypes.c_int32 * len(mem.meta.mapped_gpu_ids))(*mem.meta.mapped_gpu_ids)
    stm = kfd.AMDKFD_IOC_MAP_MEMORY_TO_GPU(self.kfd, handle=mem.meta.handle, device_ids_array_ptr=ctypes.addressof(c_gpus),
                                           n_devices=len(mem.meta.mapped_gpu_ids))
    assert stm.n_success == len(mem.meta.mapped_gpu_ids)

  def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
    queue = kfd.AMDKFD_IOC_CREATE_QUEUE(KFDIface.kfd, ring_base_address=ring.va_addr, ring_size=ring.size, gpu_id=self.gpu_id,
      queue_type=queue_type, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE | (xcc_id << 8), queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
      eop_buffer_address=eop_buffer.va_addr if eop_buffer else 0, eop_buffer_size=eop_buffer.size if eop_buffer else 0, ctl_stack_size=ctl_stack_size,
      ctx_save_restore_address=cwsr_buffer.va_addr if cwsr_buffer else 0, ctx_save_restore_size=ctx_save_restore_size,
      write_pointer_address=gart.va_addr, read_pointer_address=gart.va_addr + 8 * (xcc_id + 1))

    if not hasattr(self, 'doorbells'):
      self.doorbells_base = queue.doorbell_offset & (~0x1fff)  # doorbell is two pages
      self.doorbells = cast(HWInterface, KFDIface.kfd).mmap(0, 0x2000, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.doorbells_base)

    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"),
                        read_ptrs=[to_mv(queue.read_pointer_address, 8).cast("Q")], write_ptrs=[to_mv(queue.write_pointer_address, 8).cast("Q")],
                        doorbells=[to_mv(self.doorbells + queue.doorbell_offset - self.doorbells_base, 8).cast("Q")])

  def sleep(self, tm:int): kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=self.queue_event_arr_ptr, num_events=1, wait_for_all=1, timeout=tm)
  def on_device_hang(self):
    def _collect_str(st): return ' '.join(f'{k[0]}={getattr(st, k[0])}' for k in st._fields_)

    report = []
    for evnt in [self.mem_fault_event, self.hw_fault_event]:
      ev = (kfd.struct_kfd_event_data)(event_id=evnt.event_id)
      kfd.AMDKFD_IOC_WAIT_EVENTS(KFDIface.kfd, events_ptr=ctypes.addressof(ev), num_events=1, wait_for_all=1)
      if evnt == self.mem_fault_event and ev.memory_exception_data.gpu_id:
        report += [f"MMU fault: 0x{ev.memory_exception_data.va:X} | {_collect_str(ev.memory_exception_data.failure)}"]
      if evnt == self.hw_fault_event and ev.hw_exception_data.gpu_id: report += [f"HW fault: {_collect_str(ev.hw_exception_data)}"]

    raise RuntimeError("\n".join(report))

@dataclass
class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
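# Driverless mode: the amdgpu kernel driver is not used at all. The device is unbound from its kernel driver,
# BAR0 is resized to expose all of VRAM, interrupts are (optionally) delivered through VFIO, and the AM driver
# then initializes the hardware from userspace.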
class PCIIface:
  supported_devs: list[int] = [0x744c, 0x7480]
  vfio: bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
  vfio_fd: HWInterface
  gpus: list[Any] = []

  def __init__(self, dev, dev_id):
    self.dev = dev

    if first_dev:=len(PCIIface.gpus) == 0:
      for pcibus in HWInterface("/sys/bus/pci/devices").listdir():
        vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
        device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
        if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)

      # TODO: visible_devices should be handled in a layer above this?
      visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
      PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus

    self.pcibus = PCIIface.gpus[dev_id]

    # Unbind the device from the kernel driver
    if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
      HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)

    supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
    try: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
    except OSError as e: raise RuntimeError(f"Cannot resize BAR: {e}. Ensure the resizable BAR option is enabled on your system.") from e

    # Try to init vfio. Use it if successful.
    if PCIIface.vfio:
      try:
        if first_dev:
          HWInterface("/sys/module/vfio/parameters/enable_unsafe_noiommu_mode", os.O_RDWR).write("1")
          PCIIface.vfio_fd = HWInterface("/dev/vfio/vfio", os.O_RDWR)
        vfio.VFIO_CHECK_EXTENSION(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)

        HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver_override", os.O_WRONLY).write("vfio-pci")
        HWInterface("/sys/bus/pci/drivers_probe", os.O_WRONLY).write(self.pcibus)

        iommu_group = HWInterface.readlink(f"/sys/bus/pci/devices/{self.pcibus}/iommu_group").split('/')[-1]
      except OSError:
        if DEBUG >= 1: print(f"am {self.pcibus}: failed to init vfio-pci module (run `sudo modprobe vfio-pci`).")
        PCIIface.vfio = False

    # Init vfio for the device
    if PCIIface.vfio:
      self.vfio_group = HWInterface(f"/dev/vfio/noiommu-{iommu_group}", os.O_RDWR)
      vfio.VFIO_GROUP_SET_CONTAINER(self.vfio_group, ctypes.c_int(PCIIface.vfio_fd.fd))

      if first_dev: vfio.VFIO_SET_IOMMU(PCIIface.vfio_fd, vfio.VFIO_NOIOMMU_IOMMU)
      self.vfio_dev = HWInterface(fd=vfio.VFIO_GROUP_GET_DEVICE_FD(self.vfio_group, ctypes.create_string_buffer(self.pcibus.encode())))

      self.irq_fd = HWInterface.eventfd(0, 0)
      self.irq_poller = select.poll()
      self.irq_poller.register(self.irq_fd.fd, select.POLLIN)

      irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD | vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
                                      argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
      vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
    else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")

    self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
    self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
    self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}

    bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
    self.bar_info = {j: (int(start, 16), int(end, 16), int(flgs, 16)) for j, (start, end, flgs) in enumerate(l.split() for l in bar_info)}

    self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
    self.ip_versions = self.adev.ip_ver
    self.ip_offsets = {hwip: tuple(instances[0]) for hwip, instances in self.adev.regs_offset.items()}

    self.doorbell_cpu_addr = mv_address(dbell)

    pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
    self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)

    gfxver = int(f"{self.adev.ip_ver[am.GC_HWIP][0]:02d}{self.adev.ip_ver[am.GC_HWIP][1]:02d}{self.adev.ip_ver[am.GC_HWIP][2]:02d}")
    array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
    simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
    self.props = {'simd_count': 2 * simd_count, 'simd_per_cu': 2, 'array_count': array_count, 'gfx_target_version': gfxver,
      'max_slots_scratch_cu': self.adev.gc_info.gc_max_scratch_slots_per_cu, 'max_waves_per_simd': self.adev.gc_info.gc_max_waves_per_simd,
      'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
  def _map_pci_range(self, bar, off=0, addr=0, size=None):
    fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
    libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
    return to_mv(loc, sz)

  def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
    if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access):  # host or gtt-like memory
      vaddr = self.adev.mm.alloc_vaddr(size:=round_up(size, mmap.PAGESIZE), align=mmap.PAGESIZE)
      va = HWInterface.anon_mmap(vaddr, size, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | mmap.MAP_ANONYMOUS | MAP_LOCKED | MAP_FIXED, 0)

      # Read pagemap to get the physical address of each page. The pages are locked.
      self.pagemap.seek(va // mmap.PAGESIZE * 8)
      paddrs = [((x & ((1 << 55) - 1)) * mmap.PAGESIZE, mmap.PAGESIZE)
                for x in array.array('Q', self.pagemap.read(size // mmap.PAGESIZE * 8, binary=True))]
      am_mapping = self.adev.mm.map_range(vaddr, size, paddrs, system=True, snooped=True, uncached=True)
      return HCQBuffer(vaddr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))

    am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
    if cpu_access: self._map_pci_range(bar=0, off=am_mapping.paddrs[0][0], addr=am_mapping.va_addr, size=am_mapping.size)
    return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping))

  def free(self, mem):
    for dev in mem.meta.mapped_devs[1:]: dev.dev_iface.adev.mm.unmap_range(mem.va_addr, mem.size)
    if not mem.meta.mapping.system: self.adev.mm.vfree(mem.meta.mapping)

  def map(self, mem):
    # Check if the memory is already mapped on this device
    if self.dev in mem.meta.mapped_devs: return
    mem.meta.mapped_devs.append(self.dev)

    paddrs = [(paddr if mem.meta.mapping.system else (paddr + mem.meta.owner.dev_iface.bar_info[0][0]), size)
              for paddr, size in mem.meta.mapping.paddrs]
    self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)

  def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
    if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
      self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr + 0x10,
                                doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
    else:
      self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr + 0x10,
        eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)

    return AMDQueueDesc(ring=to_mv(ring.va_addr, ring.size).cast("I"), doorbells=[to_mv(self.doorbell_cpu_addr + doorbell_index * 8, 8).cast("Q")],
                        read_ptrs=[to_mv(gart.va_addr, 8).cast("Q")], write_ptrs=[to_mv(gart.va_addr + 0x10, 8).cast("Q")])

  def sleep(self, timeout):
    if PCIIface.vfio and (events_cnt:=len(self.irq_poller.poll(timeout))):
      self.irq_fd.read(8 * events_cnt)
      self.adev.ih.interrupt_handler()

  def on_device_hang(self):
    for d in self.dev.devices: d.dev_iface.adev.gmc.on_interrupt()
    raise RuntimeError("Device hang detected")

  def device_fini(self): self.adev.fini()
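# AMDDevice ties it all together: it picks the interface (PCI driverless vs KFD), derives per-target limits from
# the topology properties, creates the compute and SDMA queues, and wires up the HCQ machinery.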
class AMDDevice(HCQCompiled):
  devices: ClassVar[list[HCQCompiled]] = []
  signal_pages: ClassVar[list[Any]] = []
  signal_pool: ClassVar[list[int]] = []

  driverless: bool = not HWInterface.exists('/sys/module/amdgpu') or bool(getenv("AMD_DRIVERLESS", 0))

  def __init__(self, device:str=""):
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
    self.dev_iface = PCIIface(self, self.device_id) if AMDDevice.driverless else KFDIface(self, self.device_id)
    self.target = int(self.dev_iface.props['gfx_target_version'])
    self.gfxver = self.target // 10000
    self.arch = "gfx%d%x%x" % (self.target // 10000, (self.target // 100) % 100, self.target % 100)
    if self.target < 90402 or self.target >= 120000: raise RuntimeError(f"Unsupported arch: {self.arch}")
    if DEBUG >= 1: print(f"AMDDevice: opening {self.device_id} with target {self.target} arch {self.arch}")

    self.max_cu_id = self.dev_iface.props['simd_count'] // self.dev_iface.props['simd_per_cu'] // self.dev_iface.props.get('num_xcc', 1) - 1
    self.max_wave_id = (self.dev_iface.props['max_waves_per_simd'] * self.dev_iface.props['simd_per_cu'] - 1) if self.target >= 100100 else \
                       (min((self.max_cu_id + 1) * 40, self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine'] * 512) - 1)
    self.xccs = self.dev_iface.props.get('num_xcc', 1) if getenv("XCCS", 1) else 1
    self.has_scratch_base_registers = self.target >= 110000 or self.target == 90402  # this is what llvm refers to as "architected flat scratch"

    # https://gitlab.freedesktop.org/agd5f/linux/-/blob/a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8/drivers/gpu/drm/amd/amdkfd/kfd_queue.c#L391
    sgrp_size_per_cu, lds_size_per_cu, hwreg_size_per_cu = 0x4000, 0x10000, 0x1000
    vgpr_size_per_cu = 0x60000 if self.target in {110000, 110001, 120000, 120001} else \
                       0x80000 if (self.target // 100) * 100 == 90400 or self.target in {90008, 90010} else 0x40000
    wg_data_size = round_up((vgpr_size_per_cu + sgrp_size_per_cu + lds_size_per_cu + hwreg_size_per_cu) * (self.max_cu_id + 1), mmap.PAGESIZE)
    ctl_stack_size = round_up(12 * (self.max_cu_id + 1) * (self.max_wave_id + 1) + 8 + 40, mmap.PAGESIZE) if self.target >= 100100 else \
                     round_up((self.max_wave_id + 1) * 8 + 8 + 40, mmap.PAGESIZE)
    debug_memory_size = round_up((self.max_cu_id + 1 if self.target >= 100100 else 1) * (self.max_wave_id + 1) * 32, 64)
    if self.gfxver == 10: ctl_stack_size = min(ctl_stack_size, 0x7000)

    self.soc = importlib.import_module(f"tinygrad.runtime.autogen.am.{({9: 'vega10', 10: 'navi10', 11: 'soc21', 12: 'soc24'}[self.gfxver])}")
    self.pm4 = importlib.import_module(f"tinygrad.runtime.autogen.am.pm4_{'nv' if self.gfxver >= 10 else 'soc15'}")
    self.sdma = import_module('sdma', min(self.dev_iface.ip_versions[am.SDMA0_HWIP], (6, 0, 0)))
    self.gc = AMDIP('gc', self.dev_iface.ip_versions[am.GC_HWIP], self.dev_iface.ip_offsets[am.GC_HWIP])
    pad = (0,) if self.gfxver == 9 else ()  # ?!?!?!?!??!?!?!
    self.nbio = AMDIP('nbio' if self.gfxver < 12 else 'nbif', self.dev_iface.ip_versions[am.NBIF_HWIP], pad + self.dev_iface.ip_offsets[am.NBIF_HWIP])

    self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
                                           eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)

    max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
    self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)

    super().__init__(device, AMDAllocator(self), AMDLLVMRenderer() if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
                     AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
                     AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size))
    # Scratch setup
    self.max_private_segment_size = 0
    self._ensure_has_local_memory(128)  # set default scratch size to 128 bytes per thread

    # XCC setup
    self.xcc_sync: tuple[AMDSignal, AMDSignal] | None = (AMDSignal(), AMDSignal()) if self.xccs > 1 else None
    if self.xccs > 1: AMDComputeQueue(self).xcc_config().submit(self)

    # SQTT is disabled by default because of runtime overhead and big file sizes (~200mb to Tensor.full() two 4096x4096 tensors and matmul them)
    self.sqtt_enabled = PROFILE and bool(getenv("SQTT", 0))
    if self.sqtt_enabled:
      if self.arch != 'gfx1100': raise RuntimeError('SQ Thread Tracing is only supported on 7900XTX')
      if not self.driverless and (ppfeaturemask:=int(HWInterface('/sys/module/amdgpu/parameters/ppfeaturemask', os.O_RDONLY).read(), 16)) & 0x8000:
        raise RuntimeError("SQTT can't be enabled because of a hardware bug. To work around it, either use driverless or add "
                           f"ppfeaturemask={(ppfeaturemask & ~0x8000):#x} (current {ppfeaturemask=:#x} & ~PP_GFXOFF_MASK) to amdgpu module parameters\n"
                           "For more information read https://github.com/tinygrad/tinygrad/blob/master/extra/sqtt/README.md")
      SQTT_BUFFER_SIZE = getenv("SQTT_BUFFER_SIZE", 256)  # in mb, per shader engine
      SQTT_NUM = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
      self.sqtt_buffers = [self.allocator.alloc(SQTT_BUFFER_SIZE * 1024 * 1024, BufferSpec(cpu_access=True, nolru=True)) for _ in range(SQTT_NUM)]
      self.sqtt_itrace_se_mask = getenv("SQTT_ITRACE_SE_MASK", 2)  # -1 enable all, 0 disable all, >0 bitmask for where to enable instruction tracing
      self.cmd_id = 0
      AMDComputeQueue(self).start_trace(self.sqtt_buffers, self.sqtt_itrace_se_mask).submit(self)

  def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
    ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
    gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
    eop_buffer = self.dev_iface.alloc(eop_buffer_size) if eop_buffer_size else None
    cwsr_buffer_size = round_up((ctx_save_restore_size + debug_memory_size) * self.dev_iface.props.get('num_xcc', 1), mmap.PAGESIZE)
    return AMDQueueDesc.multi(*(self.dev_iface.create_queue(queue_type, ring, gart, eop_buffer=eop_buffer, xcc_id=xcc_id,
                                                            ctx_save_restore_size=ctx_save_restore_size, ctl_stack_size=ctl_stack_size,
                                                            cwsr_buffer=(self.dev_iface.alloc(cwsr_buffer_size) if ctx_save_restore_size else None))
                                for xcc_id in range(self.xccs if queue_type == kfd.KFD_IOC_QUEUE_TYPE_COMPUTE else 1)))
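  # Scratch (private segment) sizing: tmpring_size packs the per-wave scratch size in 256B/1KB granules (<<12)
  # together with the number of wave slots (per SE on gfx11+); both are re-derived whenever a kernel asks for a
  # larger private segment than what is currently allocated.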
  def _ensure_has_local_memory(self, required):
    if self.max_private_segment_size >= required: return

    # <gfx103 requires alignment of 1024, >=gfx11 requires 256
    wave_scratch_len = round_up(((self.max_wave_id + 1) * required), 256 if self.target >= 110000 else 1024)

    scratch_size = (self.max_cu_id + 1) * self.dev_iface.props['max_slots_scratch_cu'] * wave_scratch_len  # per xcc
    self.scratch, ok = self._realloc(getattr(self, 'scratch', None), scratch_size * self.xccs)
    if ok:
      engines = self.dev_iface.props['array_count'] // self.dev_iface.props['simd_arrays_per_engine']
      waves = wave_scratch_len // (256 if self.target >= 110000 else 1024)
      # >=gfx11 wavesize is per SE
      wavesize = scratch_size // ((wave_scratch_len * engines) if self.target >= 110000 else wave_scratch_len)
      self.tmpring_size = waves << 12 | wavesize
      self.max_private_segment_size = required

  def invalidate_caches(self):
    AMDComputeQueue(self).memory_barrier().signal(self.timeline_signal, self.next_timeline()).submit(self)
    self.synchronize()

  def on_device_hang(self): self.dev_iface.on_device_hang()

  def _at_profile_finalize(self):
    if self.sqtt_enabled:
      wptrs_buf = self.allocator.alloc(round_up(len(self.sqtt_buffers), 0x1000), BufferSpec(cpu_access=True, nolru=True))
      wptrs = to_mv(wptrs_buf.va_addr, wptrs_buf.size)
      AMDComputeQueue(self).stop_trace(len(self.sqtt_buffers), wptrs_buf).signal(self.timeline_signal, self.next_timeline()).submit(self)
      self.synchronize()
      if DEBUG >= 2: print('Saving SQTT in profile...')
      for i, buf0 in enumerate(self.sqtt_buffers):
        wptr = ((struct.unpack('<I', wptrs[i*4:i*4+4])[0] & 0x1FFFFFFF) - ((buf0.va_addr // 32) & 0x1FFFFFFF)) * 32
        if DEBUG >= 2: print(f'Se {i} blob size {wptr:#x}')
        assert wptr >= 0 and wptr <= buf0.size, f"{wptr} > {buf0.size}, should never happen"
        # When the sqtt buffer overflows, wptr stops at the last dword
        if wptr >= buf0.size - 32: print(f"WARNING: SQTT BUFFER IS FULL (SE {i})! INCREASE SQTT BUFFER SIZE WITH SQTT_BUFFER_SIZE=X (in MB)")
        self.allocator._copyout(sqtt_buf:=memoryview(bytearray(wptr)), buf0)
        Compiled.profile_events += [ProfileSQTTEvent(self.device, i, bytes(sqtt_buf), bool((self.sqtt_itrace_se_mask >> i) & 0b1))]
    super()._at_profile_finalize()

  def finalize(self):
    self.synchronize()
    if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()