# include "selfdrive/modeld/thneed/thneed.h"
# include <dlfcn.h>
# include <sys/mman.h>
# include <cassert>
# include <cerrno>
# include <cstring>
# include <map>
# include <string>
# include "common/clutil.h"
# include "common/timing.h"
Thneed * g_thneed = NULL ;
int g_fd = - 1 ;
void hexdump ( uint8_t * d , int len ) {
assert ( ( len % 4 ) = = 0 ) ;
printf ( " dumping %p len 0x%x \n " , d , len ) ;
for ( int i = 0 ; i < len / 4 ; i + + ) {
if ( i ! = 0 & & ( i % 0x10 ) = = 0 ) printf ( " \n " ) ;
printf ( " %8x " , d [ i ] ) ;
}
printf ( " \n " ) ;
}
// *********** ioctl interceptor ***********
extern " C " {
int ( * my_ioctl ) ( int filedes , unsigned long request , void * argp ) = NULL ;
# undef ioctl
int ioctl ( int filedes , unsigned long request , void * argp ) {
request & = 0xFFFFFFFF ; // needed on QCOM2
if ( my_ioctl = = NULL ) my_ioctl = reinterpret_cast < decltype ( my_ioctl ) > ( dlsym ( RTLD_NEXT , " ioctl " ) ) ;
Thneed * thneed = g_thneed ;
// save the fd
if ( request = = IOCTL_KGSL_GPUOBJ_ALLOC ) g_fd = filedes ;
// note that this runs always, even without a thneed object
if ( request = = IOCTL_KGSL_DRAWCTXT_CREATE ) {
struct kgsl_drawctxt_create * create = ( struct kgsl_drawctxt_create * ) argp ;
create - > flags & = ~ KGSL_CONTEXT_PRIORITY_MASK ;
create - > flags | = 1 < < KGSL_CONTEXT_PRIORITY_SHIFT ; // priority from 1-15, 1 is max priority
printf ( " IOCTL_KGSL_DRAWCTXT_CREATE: creating context with flags 0x%x \n " , create - > flags ) ;
}
if ( thneed ! = NULL ) {
if ( request = = IOCTL_KGSL_GPU_COMMAND ) {
struct kgsl_gpu_command * cmd = ( struct kgsl_gpu_command * ) argp ;
if ( thneed - > record ) {
thneed - > timestamp = cmd - > timestamp ;
thneed - > context_id = cmd - > context_id ;
thneed - > cmds . push_back ( unique_ptr < CachedCommand > ( new CachedCommand ( thneed , cmd ) ) ) ;
}
if ( thneed - > debug > = 1 ) {
printf ( " IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u numcmds: %d numobjs: %d \n " ,
thneed - > cmds . size ( ) ,
cmd - > flags ,
cmd - > context_id , cmd - > timestamp , cmd - > numcmds , cmd - > numobjs ) ;
}
} else if ( request = = IOCTL_KGSL_GPUOBJ_SYNC ) {
struct kgsl_gpuobj_sync * cmd = ( struct kgsl_gpuobj_sync * ) argp ;
struct kgsl_gpuobj_sync_obj * objs = ( struct kgsl_gpuobj_sync_obj * ) ( cmd - > objs ) ;
if ( thneed - > debug > = 2 ) {
printf ( " IOCTL_KGSL_GPUOBJ_SYNC count:%d " , cmd - > count ) ;
for ( int i = 0 ; i < cmd - > count ; i + + ) {
printf ( " -- offset:0x%lx len:0x%lx id:%d op:%d " , objs [ i ] . offset , objs [ i ] . length , objs [ i ] . id , objs [ i ] . op ) ;
}
printf ( " \n " ) ;
}
if ( thneed - > record ) {
thneed - > cmds . push_back ( unique_ptr < CachedSync > ( new
CachedSync ( thneed , string ( ( char * ) objs , sizeof ( struct kgsl_gpuobj_sync_obj ) * cmd - > count ) ) ) ) ;
}
} else if ( request = = IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID ) {
struct kgsl_device_waittimestamp_ctxtid * cmd = ( struct kgsl_device_waittimestamp_ctxtid * ) argp ;
if ( thneed - > debug > = 1 ) {
printf ( " IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d \n " ,
cmd - > context_id , cmd - > timestamp , cmd - > timeout ) ;
}
} else if ( request = = IOCTL_KGSL_SETPROPERTY ) {
if ( thneed - > debug > = 1 ) {
struct kgsl_device_getproperty * prop = ( struct kgsl_device_getproperty * ) argp ;
printf ( " IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu \n " , prop - > type , prop - > sizebytes ) ;
if ( thneed - > debug > = 2 ) {
hexdump ( ( uint8_t * ) prop - > value , prop - > sizebytes ) ;
if ( prop - > type = = KGSL_PROP_PWR_CONSTRAINT ) {
struct kgsl_device_constraint * constraint = ( struct kgsl_device_constraint * ) prop - > value ;
hexdump ( ( uint8_t * ) constraint - > data , constraint - > size ) ;
}
}
}
} else if ( request = = IOCTL_KGSL_DRAWCTXT_CREATE | | request = = IOCTL_KGSL_DRAWCTXT_DESTROY ) {
// this happens
} else if ( request = = IOCTL_KGSL_GPUOBJ_ALLOC | | request = = IOCTL_KGSL_GPUOBJ_FREE ) {
// this happens
} else {
if ( thneed - > debug > = 1 ) {
printf ( " other ioctl %lx \n " , request ) ;
}
}
}
int ret = my_ioctl ( filedes , request , argp ) ;
if ( ret ! = 0 ) printf ( " ioctl returned %d with errno %d \n " , ret , errno ) ;
return ret ;
}
}
// *********** GPUMalloc ***********
GPUMalloc : : GPUMalloc ( int size , int fd ) {
struct kgsl_gpuobj_alloc alloc ;
memset ( & alloc , 0 , sizeof ( alloc ) ) ;
alloc . size = size ;
alloc . flags = 0x10000a00 ;
ioctl ( fd , IOCTL_KGSL_GPUOBJ_ALLOC , & alloc ) ;
void * addr = mmap64 ( NULL , alloc . mmapsize , 0x3 , 0x1 , fd , alloc . id * 0x1000 ) ;
assert ( addr ! = MAP_FAILED ) ;
base = ( uint64_t ) addr ;
remaining = size ;
}
GPUMalloc : : ~ GPUMalloc ( ) {
// TODO: free the GPU malloced area
}
void * GPUMalloc : : alloc ( int size ) {
void * ret = ( void * ) base ;
size = ( size + 0xff ) & ( ~ 0xFF ) ;
assert ( size < = remaining ) ;
remaining - = size ;
base + = size ;
return ret ;
}
// *********** CachedSync, at the ioctl layer ***********
void CachedSync : : exec ( ) {
struct kgsl_gpuobj_sync cmd ;
cmd . objs = ( uint64_t ) data . data ( ) ;
cmd . obj_len = data . length ( ) ;
cmd . count = data . length ( ) / sizeof ( struct kgsl_gpuobj_sync_obj ) ;
int ret = ioctl ( thneed - > fd , IOCTL_KGSL_GPUOBJ_SYNC , & cmd ) ;
assert ( ret = = 0 ) ;
}
// *********** CachedCommand, at the ioctl layer ***********
CachedCommand : : CachedCommand ( Thneed * lthneed , struct kgsl_gpu_command * cmd ) {
thneed = lthneed ;
assert ( cmd - > numsyncs = = 0 ) ;
memcpy ( & cache , cmd , sizeof ( cache ) ) ;
if ( cmd - > numcmds > 0 ) {
cmds = make_unique < struct kgsl_command_object [ ] > ( cmd - > numcmds ) ;
memcpy ( cmds . get ( ) , ( void * ) cmd - > cmdlist , sizeof ( struct kgsl_command_object ) * cmd - > numcmds ) ;
cache . cmdlist = ( uint64_t ) cmds . get ( ) ;
for ( int i = 0 ; i < cmd - > numcmds ; i + + ) {
void * nn = thneed - > ram - > alloc ( cmds [ i ] . size ) ;
memcpy ( nn , ( void * ) cmds [ i ] . gpuaddr , cmds [ i ] . size ) ;
cmds [ i ] . gpuaddr = ( uint64_t ) nn ;
}
}
if ( cmd - > numobjs > 0 ) {
objs = make_unique < struct kgsl_command_object [ ] > ( cmd - > numobjs ) ;
memcpy ( objs . get ( ) , ( void * ) cmd - > objlist , sizeof ( struct kgsl_command_object ) * cmd - > numobjs ) ;
cache . objlist = ( uint64_t ) objs . get ( ) ;
for ( int i = 0 ; i < cmd - > numobjs ; i + + ) {
void * nn = thneed - > ram - > alloc ( objs [ i ] . size ) ;
memset ( nn , 0 , objs [ i ] . size ) ;
objs [ i ] . gpuaddr = ( uint64_t ) nn ;
}
}
kq = thneed - > ckq ;
thneed - > ckq . clear ( ) ;
}
void CachedCommand : : exec ( ) {
cache . timestamp = + + thneed - > timestamp ;
int ret = ioctl ( thneed - > fd , IOCTL_KGSL_GPU_COMMAND , & cache ) ;
if ( thneed - > debug > = 1 ) printf ( " CachedCommand::exec got %d \n " , ret ) ;
if ( thneed - > debug > = 2 ) {
for ( auto & it : kq ) {
it - > debug_print ( false ) ;
}
}
assert ( ret = = 0 ) ;
}
// *********** Thneed ***********
Thneed : : Thneed ( bool do_clinit , cl_context _context ) {
// TODO: QCOM2 actually requires a different context
//context = _context;
if ( do_clinit ) clinit ( ) ;
assert ( g_fd ! = - 1 ) ;
fd = g_fd ;
ram = make_unique < GPUMalloc > ( 0x80000 , fd ) ;
timestamp = - 1 ;
g_thneed = this ;
char * thneed_debug_env = getenv ( " THNEED_DEBUG " ) ;
debug = ( thneed_debug_env ! = NULL ) ? atoi ( thneed_debug_env ) : 0 ;
}
void Thneed : : stop ( ) {
find_inputs_outputs ( ) ;
printf ( " Thneed::stop: recorded %lu commands \n " , cmds . size ( ) ) ;
record = false ;
}
void Thneed : : find_inputs_outputs ( ) {
cl_int err ;
if ( inputs . size ( ) > 0 ) return ;
// save the global inputs/outputs
for ( auto & k : kq ) {
for ( int i = 0 ; i < k - > num_args ; i + + ) {
if ( k - > name = = " zero_pad_image_float " & & k - > arg_names [ i ] = = " input " ) {
cl_mem aa = * ( cl_mem * ) ( k - > args [ i ] . data ( ) ) ;
input_clmem . push_back ( aa ) ;
size_t sz ;
clGetMemObjectInfo ( aa , CL_MEM_SIZE , sizeof ( sz ) , & sz , NULL ) ;
input_sizes . push_back ( sz ) ;
void * ret = clEnqueueMapBuffer ( command_queue , aa , CL_TRUE , CL_MAP_WRITE , 0 , sz , 0 , NULL , NULL , & err ) ;
assert ( err = = CL_SUCCESS ) ;
inputs . push_back ( ret ) ;
}
if ( k - > name = = " image2d_to_buffer_float " & & k - > arg_names [ i ] = = " output " ) {
output = * ( cl_mem * ) ( k - > args [ i ] . data ( ) ) ;
}
}
}
}
void Thneed : : wait ( ) {
struct kgsl_device_waittimestamp_ctxtid wait ;
wait . context_id = context_id ;
wait . timestamp = timestamp ;
wait . timeout = - 1 ;
uint64_t tb = nanos_since_boot ( ) ;
int wret = ioctl ( fd , IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID , & wait ) ;
uint64_t te = nanos_since_boot ( ) ;
if ( debug > = 1 ) printf ( " wait %d after %lu us \n " , wret , ( te - tb ) / 1000 ) ;
}
void Thneed : : execute ( float * * finputs , float * foutput , bool slow ) {
uint64_t tb , te ;
if ( debug > = 1 ) tb = nanos_since_boot ( ) ;
// ****** copy inputs
copy_inputs ( finputs , true ) ;
// ****** set power constraint
int ret ;
struct kgsl_device_constraint_pwrlevel pwrlevel ;
pwrlevel . level = KGSL_CONSTRAINT_PWR_MAX ;
struct kgsl_device_constraint constraint ;
constraint . type = KGSL_CONSTRAINT_PWRLEVEL ;
constraint . context_id = context_id ;
constraint . data = ( void * ) & pwrlevel ;
constraint . size = sizeof ( pwrlevel ) ;
struct kgsl_device_getproperty prop ;
prop . type = KGSL_PROP_PWR_CONSTRAINT ;
prop . value = ( void * ) & constraint ;
prop . sizebytes = sizeof ( constraint ) ;
ret = ioctl ( fd , IOCTL_KGSL_SETPROPERTY , & prop ) ;
assert ( ret = = 0 ) ;
// ****** run commands
int i = 0 ;
for ( auto & it : cmds ) {
+ + i ;
if ( debug > = 1 ) printf ( " run %2d @ %7lu us: " , i , ( nanos_since_boot ( ) - tb ) / 1000 ) ;
it - > exec ( ) ;
if ( ( i = = cmds . size ( ) ) | | slow ) wait ( ) ;
}
// ****** copy outputs
copy_output ( foutput ) ;
// ****** unset power constraint
constraint . type = KGSL_CONSTRAINT_NONE ;
constraint . data = NULL ;
constraint . size = 0 ;
ret = ioctl ( fd , IOCTL_KGSL_SETPROPERTY , & prop ) ;
assert ( ret = = 0 ) ;
if ( debug > = 1 ) {
te = nanos_since_boot ( ) ;
printf ( " model exec in %lu us \n " , ( te - tb ) / 1000 ) ;
}
}
// *********** OpenCL interceptor ***********
cl_int thneed_clEnqueueNDRangeKernel ( cl_command_queue command_queue ,
cl_kernel kernel ,
cl_uint work_dim ,
const size_t * global_work_offset ,
const size_t * global_work_size ,
const size_t * local_work_size ,
cl_uint num_events_in_wait_list ,
const cl_event * event_wait_list ,
cl_event * event ) {
Thneed * thneed = g_thneed ;
// SNPE doesn't use these
assert ( num_events_in_wait_list = = 0 ) ;
assert ( global_work_offset = = NULL ) ;
assert ( event_wait_list = = NULL ) ;
cl_int ret = 0 ;
if ( thneed ! = NULL & & thneed - > record ) {
if ( thneed - > context = = NULL ) {
thneed - > command_queue = command_queue ;
clGetKernelInfo ( kernel , CL_KERNEL_CONTEXT , sizeof ( thneed - > context ) , & thneed - > context , NULL ) ;
clGetContextInfo ( thneed - > context , CL_CONTEXT_DEVICES , sizeof ( thneed - > device_id ) , & thneed - > device_id , NULL ) ;
}
// if we are recording, we don't actually enqueue the kernel
thneed - > kq . push_back ( unique_ptr < CLQueuedKernel > ( new CLQueuedKernel ( thneed , kernel , work_dim , global_work_size , local_work_size ) ) ) ;
* event = NULL ;
} else {
ret = clEnqueueNDRangeKernel ( command_queue , kernel , work_dim ,
global_work_offset , global_work_size , local_work_size ,
num_events_in_wait_list , event_wait_list , event ) ;
}
return ret ;
}
cl_int thneed_clFinish ( cl_command_queue command_queue ) {
Thneed * thneed = g_thneed ;
if ( thneed ! = NULL & & thneed - > record ) {
if ( thneed - > run_optimizer ) thneed - > optimize ( ) ;
return thneed - > clexec ( ) ;
} else {
return clFinish ( command_queue ) ;
}
}
void * dlsym ( void * handle , const char * symbol ) {
# ifdef QCOM2
void * ( * my_dlsym ) ( void * handle , const char * symbol ) = ( void * ( * ) ( void * handle , const char * symbol ) ) ( ( uintptr_t ) dlopen + DLSYM_OFFSET ) ;
# else
# error "Unsupported platform for thneed"
# endif
if ( memcmp ( " REAL_ " , symbol , 5 ) = = 0 ) {
return my_dlsym ( handle , symbol + 5 ) ;
} else if ( strcmp ( " clFinish " , symbol ) = = 0 ) {
return ( void * ) thneed_clFinish ;
} else if ( strcmp ( " clEnqueueNDRangeKernel " , symbol ) = = 0 ) {
return ( void * ) thneed_clEnqueueNDRangeKernel ;
} else if ( strcmp ( " clSetKernelArg " , symbol ) = = 0 ) {
return ( void * ) thneed_clSetKernelArg ;
} else if ( strcmp ( " clCreateProgramWithSource " , symbol ) = = 0 ) {
return ( void * ) thneed_clCreateProgramWithSource ;
} else {
return my_dlsym ( handle , symbol ) ;
}
}