# include "thneed.h"
# include <cassert>
# include <sys/mman.h>
# include <dlfcn.h>
# include <map>
# include <string>
# include <errno.h>
Thneed * g_thneed = NULL ;
int g_fd = - 1 ;
map < pair < cl_kernel , int > , string > g_args ;
static inline uint64_t nanos_since_boot ( ) {
struct timespec t ;
clock_gettime ( CLOCK_BOOTTIME , & t ) ;
return t . tv_sec * 1000000000ULL + t . tv_nsec ;
}
void hexdump ( uint32_t * d , int len ) {
assert ( ( len % 4 ) = = 0 ) ;
printf ( " dumping %p len 0x%x \n " , d , len ) ;
for ( int i = 0 ; i < len / 4 ; i + + ) {
if ( i ! = 0 & & ( i % 0x10 ) = = 0 ) printf ( " \n " ) ;
printf ( " %8x " , d [ i ] ) ;
}
printf ( " \n " ) ;
}
extern " C " {
int ( * my_ioctl ) ( int filedes , unsigned long request , void * argp ) = NULL ;
# undef ioctl
int ioctl ( int filedes , unsigned long request , void * argp ) {
if ( my_ioctl = = NULL ) my_ioctl = reinterpret_cast < decltype ( my_ioctl ) > ( dlsym ( RTLD_NEXT , " ioctl " ) ) ;
Thneed * thneed = g_thneed ;
// save the fd
if ( request = = IOCTL_KGSL_GPUOBJ_ALLOC ) g_fd = filedes ;
if ( thneed ! = NULL ) {
if ( request = = IOCTL_KGSL_GPU_COMMAND ) {
struct kgsl_gpu_command * cmd = ( struct kgsl_gpu_command * ) argp ;
if ( thneed - > record & 1 ) {
thneed - > timestamp = cmd - > timestamp ;
thneed - > context_id = cmd - > context_id ;
thneed - > cmds . push_back ( unique_ptr < CachedCommand > ( new CachedCommand ( thneed , cmd ) ) ) ;
}
if ( thneed - > record & 2 ) {
printf ( " IOCTL_KGSL_GPU_COMMAND(%2zu): flags: 0x%lx context_id: %u timestamp: %u \n " ,
thneed - > cmds . size ( ) ,
cmd - > flags ,
cmd - > context_id , cmd - > timestamp ) ;
}
} else if ( request = = IOCTL_KGSL_GPUOBJ_SYNC ) {
struct kgsl_gpuobj_sync * cmd = ( struct kgsl_gpuobj_sync * ) argp ;
struct kgsl_gpuobj_sync_obj * objs = ( struct kgsl_gpuobj_sync_obj * ) ( cmd - > objs ) ;
if ( thneed - > record & 2 ) {
printf ( " IOCTL_KGSL_GPUOBJ_SYNC count:%d " , cmd - > count ) ;
for ( int i = 0 ; i < cmd - > count ; i + + ) {
printf ( " -- offset:0x%lx len:0x%lx id:%d op:%d " , objs [ i ] . offset , objs [ i ] . length , objs [ i ] . id , objs [ i ] . op ) ;
}
printf ( " \n " ) ;
}
if ( thneed - > record & 1 ) {
thneed - > syncobjs . push_back ( string ( ( char * ) objs , sizeof ( struct kgsl_gpuobj_sync_obj ) * cmd - > count ) ) ;
}
} else if ( request = = IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID ) {
struct kgsl_device_waittimestamp_ctxtid * cmd = ( struct kgsl_device_waittimestamp_ctxtid * ) argp ;
if ( thneed - > record & 2 ) {
printf ( " IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d timestamp: %d timeout: %d \n " ,
cmd - > context_id , cmd - > timestamp , cmd - > timeout ) ;
}
} else if ( request = = IOCTL_KGSL_SETPROPERTY ) {
if ( thneed - > record & 2 ) {
struct kgsl_device_getproperty * prop = ( struct kgsl_device_getproperty * ) argp ;
printf ( " IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu \n " , prop - > type , prop - > sizebytes ) ;
if ( thneed - > record & 4 ) {
hexdump ( ( uint32_t * ) prop - > value , prop - > sizebytes ) ;
if ( prop - > type = = KGSL_PROP_PWR_CONSTRAINT ) {
struct kgsl_device_constraint * constraint = ( struct kgsl_device_constraint * ) prop - > value ;
hexdump ( ( uint32_t * ) constraint - > data , constraint - > size ) ;
}
}
}
}
}
int ret = my_ioctl ( filedes , request , argp ) ;
if ( ret ! = 0 ) printf ( " ioctl returned %d with errno %d \n " , ret , errno ) ;
return ret ;
}
}
GPUMalloc : : GPUMalloc ( int size , int fd ) {
struct kgsl_gpuobj_alloc alloc ;
memset ( & alloc , 0 , sizeof ( alloc ) ) ;
alloc . size = size ;
alloc . flags = 0x10000a00 ;
ioctl ( fd , IOCTL_KGSL_GPUOBJ_ALLOC , & alloc ) ;
void * addr = mmap64 ( NULL , alloc . mmapsize , 0x3 , 0x1 , fd , alloc . id * 0x1000 ) ;
assert ( addr ! = MAP_FAILED ) ;
base = ( uint64_t ) addr ;
remaining = size ;
}
GPUMalloc : : ~ GPUMalloc ( ) {
// TODO: free the GPU malloced area
}
void * GPUMalloc : : alloc ( int size ) {
if ( size > remaining ) return NULL ;
remaining - = size ;
void * ret = ( void * ) base ;
base + = ( size + 0xff ) & ( ~ 0xFF ) ;
return ret ;
}
CachedCommand : : CachedCommand ( Thneed * lthneed , struct kgsl_gpu_command * cmd ) {
thneed = lthneed ;
assert ( cmd - > numcmds = = 2 ) ;
assert ( cmd - > numobjs = = 1 ) ;
assert ( cmd - > numsyncs = = 0 ) ;
memcpy ( cmds , ( void * ) cmd - > cmdlist , sizeof ( struct kgsl_command_object ) * 2 ) ;
memcpy ( objs , ( void * ) cmd - > objlist , sizeof ( struct kgsl_command_object ) * 1 ) ;
memcpy ( & cache , cmd , sizeof ( cache ) ) ;
cache . cmdlist = ( uint64_t ) cmds ;
cache . objlist = ( uint64_t ) objs ;
for ( int i = 0 ; i < cmd - > numcmds ; i + + ) {
void * nn = thneed - > ram - > alloc ( cmds [ i ] . size ) ;
memcpy ( nn , ( void * ) cmds [ i ] . gpuaddr , cmds [ i ] . size ) ;
cmds [ i ] . gpuaddr = ( uint64_t ) nn ;
}
for ( int i = 0 ; i < cmd - > numobjs ; i + + ) {
void * nn = thneed - > ram - > alloc ( objs [ i ] . size ) ;
memset ( nn , 0 , objs [ i ] . size ) ;
objs [ i ] . gpuaddr = ( uint64_t ) nn ;
}
}
void CachedCommand : : exec ( bool wait ) {
cache . timestamp = + + thneed - > timestamp ;
int ret = ioctl ( thneed - > fd , IOCTL_KGSL_GPU_COMMAND , & cache ) ;
if ( wait ) {
struct kgsl_device_waittimestamp_ctxtid wait ;
wait . context_id = cache . context_id ;
wait . timestamp = cache . timestamp ;
wait . timeout = - 1 ;
uint64_t tb = nanos_since_boot ( ) ;
int wret = ioctl ( thneed - > fd , IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID , & wait ) ;
uint64_t te = nanos_since_boot ( ) ;
if ( thneed - > record & 2 ) printf ( " exec %d wait %d after %lu us \n " , ret , wret , ( te - tb ) / 1000 ) ;
} else {
if ( thneed - > record & 2 ) printf ( " CachedCommand::exec got %d \n " , ret ) ;
}
assert ( ret = = 0 ) ;
}
Thneed : : Thneed ( ) {
assert ( g_fd ! = - 1 ) ;
fd = g_fd ;
ram = make_unique < GPUMalloc > ( 0x40000 , fd ) ;
record = 1 ;
timestamp = - 1 ;
g_thneed = this ;
}
void Thneed : : stop ( ) {
record = 0 ;
}
//#define SAVE_LOG
void Thneed : : execute ( float * * finputs , float * foutput , bool slow ) {
uint64_t tb , te ;
if ( record & 2 ) tb = nanos_since_boot ( ) ;
# ifdef SAVE_LOG
char fn [ 0x100 ] ;
snprintf ( fn , sizeof ( fn ) , " /tmp/thneed_log_%d " , timestamp ) ;
FILE * f = fopen ( fn , " wb " ) ;
# endif
// ****** copy inputs
for ( int idx = 0 ; idx < inputs . size ( ) ; + + idx ) {
size_t sz ;
clGetMemObjectInfo ( inputs [ idx ] , CL_MEM_SIZE , sizeof ( sz ) , & sz , NULL ) ;
# ifdef SAVE_LOG
fwrite ( & sz , 1 , sizeof ( sz ) , f ) ;
fwrite ( finputs [ idx ] , 1 , sz , f ) ;
# endif
if ( record & 2 ) printf ( " copying %lu -- %p -> %p \n " , sz , finputs [ idx ] , inputs [ idx ] ) ;
clEnqueueWriteBuffer ( command_queue , inputs [ idx ] , CL_TRUE , 0 , sz , finputs [ idx ] , 0 , NULL , NULL ) ;
}
// ****** set power constraint
struct kgsl_device_constraint_pwrlevel pwrlevel ;
pwrlevel . level = KGSL_CONSTRAINT_PWR_MAX ;
struct kgsl_device_constraint constraint ;
constraint . type = KGSL_CONSTRAINT_PWRLEVEL ;
constraint . context_id = context_id ;
constraint . data = ( void * ) & pwrlevel ;
constraint . size = sizeof ( pwrlevel ) ;
struct kgsl_device_getproperty prop ;
prop . type = KGSL_PROP_PWR_CONSTRAINT ;
prop . value = ( void * ) & constraint ;
prop . sizebytes = sizeof ( constraint ) ;
int ret = ioctl ( fd , IOCTL_KGSL_SETPROPERTY , & prop ) ;
assert ( ret = = 0 ) ;
// ****** run commands
int i = 0 ;
for ( auto it = cmds . begin ( ) ; it ! = cmds . end ( ) ; + + it ) {
+ + i ;
if ( record & 2 ) printf ( " run %2d: " , i ) ;
( * it ) - > exec ( ( i = = cmds . size ( ) ) | | slow ) ;
}
// ****** sync objects
for ( auto it = syncobjs . begin ( ) ; it ! = syncobjs . end ( ) ; + + it ) {
struct kgsl_gpuobj_sync cmd ;
cmd . objs = ( uint64_t ) it - > data ( ) ;
cmd . obj_len = it - > length ( ) ;
cmd . count = it - > length ( ) / sizeof ( struct kgsl_gpuobj_sync_obj ) ;
ret = ioctl ( fd , IOCTL_KGSL_GPUOBJ_SYNC , & cmd ) ;
assert ( ret = = 0 ) ;
}
// ****** copy outputs
size_t sz ;
clGetMemObjectInfo ( output , CL_MEM_SIZE , sizeof ( sz ) , & sz , NULL ) ;
if ( record & 2 ) printf ( " copying %lu for output %p -> %p \n " , sz , output , foutput ) ;
clEnqueueReadBuffer ( command_queue , output , CL_TRUE , 0 , sz , foutput , 0 , NULL , NULL ) ;
# ifdef SAVE_LOG
fwrite ( & sz , 1 , sizeof ( sz ) , f ) ;
fwrite ( foutput , 1 , sz , f ) ;
fclose ( f ) ;
# endif
// ****** unset power constraint
constraint . type = KGSL_CONSTRAINT_NONE ;
constraint . data = NULL ;
constraint . size = 0 ;
ret = ioctl ( fd , IOCTL_KGSL_SETPROPERTY , & prop ) ;
assert ( ret = = 0 ) ;
if ( record & 2 ) {
te = nanos_since_boot ( ) ;
printf ( " model exec in %lu us \n " , ( te - tb ) / 1000 ) ;
}
}
// TODO: with a different way of getting the input and output buffers, we don't have to intercept CL at all
cl_int ( * my_clSetKernelArg ) ( cl_kernel kernel , cl_uint arg_index , size_t arg_size , const void * arg_value ) = NULL ;
cl_int thneed_clSetKernelArg ( cl_kernel kernel , cl_uint arg_index , size_t arg_size , const void * arg_value ) {
if ( my_clSetKernelArg = = NULL ) my_clSetKernelArg = reinterpret_cast < decltype ( my_clSetKernelArg ) > ( dlsym ( RTLD_NEXT , " REAL_clSetKernelArg " ) ) ;
if ( arg_value ! = NULL ) {
g_args [ make_pair ( kernel , arg_index ) ] = string ( ( char * ) arg_value , arg_size ) ;
}
cl_int ret = my_clSetKernelArg ( kernel , arg_index , arg_size , arg_value ) ;
return ret ;
}
cl_int ( * my_clEnqueueNDRangeKernel ) ( cl_command_queue , cl_kernel , cl_uint , const size_t * , const size_t * , const size_t * , cl_uint , const cl_event * , cl_event * ) = NULL ;
cl_int thneed_clEnqueueNDRangeKernel ( cl_command_queue command_queue ,
cl_kernel kernel ,
cl_uint work_dim ,
const size_t * global_work_offset ,
const size_t * global_work_size ,
const size_t * local_work_size ,
cl_uint num_events_in_wait_list ,
const cl_event * event_wait_list ,
cl_event * event ) {
if ( my_clEnqueueNDRangeKernel = = NULL ) my_clEnqueueNDRangeKernel = reinterpret_cast < decltype ( my_clEnqueueNDRangeKernel ) > ( dlsym ( RTLD_NEXT , " REAL_clEnqueueNDRangeKernel " ) ) ;
Thneed * thneed = g_thneed ;
// SNPE doesn't use these
assert ( num_events_in_wait_list = = 0 ) ;
assert ( global_work_offset = = NULL ) ;
char name [ 0x100 ] ;
clGetKernelInfo ( kernel , CL_KERNEL_FUNCTION_NAME , sizeof ( name ) , name , NULL ) ;
cl_uint num_args ;
clGetKernelInfo ( kernel , CL_KERNEL_NUM_ARGS , sizeof ( num_args ) , & num_args , NULL ) ;
if ( thneed ! = NULL & & thneed - > record & 1 ) {
thneed - > command_queue = command_queue ;
for ( int i = 0 ; i < num_args ; i + + ) {
char arg_name [ 0x100 ] ;
clGetKernelArgInfo ( kernel , i , CL_KERNEL_ARG_NAME , sizeof ( arg_name ) , arg_name , NULL ) ;
string arg = g_args [ make_pair ( kernel , i ) ] ;
if ( strcmp ( arg_name , " input " ) = = 0 & & strcmp ( name , " zero_pad_image_float " ) = = 0 ) {
cl_mem mem ;
memcpy ( & mem , ( void * ) arg . data ( ) , sizeof ( mem ) ) ;
thneed - > inputs . push_back ( mem ) ;
}
if ( strcmp ( arg_name , " output " ) = = 0 & & strcmp ( name , " image2d_to_buffer_float " ) = = 0 ) {
cl_mem mem ;
memcpy ( & mem , ( void * ) arg . data ( ) , sizeof ( mem ) ) ;
thneed - > output = mem ;
}
}
}
if ( thneed ! = NULL & & thneed - > record & 2 ) {
printf ( " %p %56s -- " , kernel , name ) ;
for ( int i = 0 ; i < work_dim ; i + + ) {
printf ( " %4zu " , global_work_size [ i ] ) ;
}
printf ( " -- " ) ;
for ( int i = 0 ; i < work_dim ; i + + ) {
printf ( " %4zu " , local_work_size [ i ] ) ;
}
printf ( " \n " ) ;
}
if ( thneed ! = NULL & & thneed - > record & 4 ) {
// extreme debug
for ( int i = 0 ; i < num_args ; i + + ) {
char arg_type [ 0x100 ] ;
char arg_name [ 0x100 ] ;
clGetKernelArgInfo ( kernel , i , CL_KERNEL_ARG_TYPE_NAME , sizeof ( arg_type ) , arg_type , NULL ) ;
clGetKernelArgInfo ( kernel , i , CL_KERNEL_ARG_NAME , sizeof ( arg_name ) , arg_name , NULL ) ;
string arg = g_args [ make_pair ( kernel , i ) ] ;
printf ( " %s %s " , arg_type , arg_name ) ;
void * arg_value = ( void * ) arg . data ( ) ;
int arg_size = arg . size ( ) ;
if ( arg_size = = 1 ) {
printf ( " = %d " , * ( ( char * ) arg_value ) ) ;
} else if ( arg_size = = 2 ) {
printf ( " = %d " , * ( ( short * ) arg_value ) ) ;
} else if ( arg_size = = 4 ) {
if ( strcmp ( arg_type , " float " ) = = 0 ) {
printf ( " = %f " , * ( ( float * ) arg_value ) ) ;
} else {
printf ( " = %d " , * ( ( int * ) arg_value ) ) ;
}
} else if ( arg_size = = 8 ) {
cl_mem val = ( cl_mem ) ( * ( ( uintptr_t * ) arg_value ) ) ;
printf ( " = %p " , val ) ;
if ( val ! = NULL ) {
if ( strcmp ( " image2d_t " , arg_type ) = = 0 | | strcmp ( " image1d_t " , arg_type ) = = 0 ) {
cl_image_format format ;
size_t width , height , depth , array_size , row_pitch , slice_pitch ;
clGetImageInfo ( val , CL_IMAGE_FORMAT , sizeof ( format ) , & format , NULL ) ;
assert ( format . image_channel_data_type = = CL_HALF_FLOAT ) ;
clGetImageInfo ( val , CL_IMAGE_WIDTH , sizeof ( width ) , & width , NULL ) ;
clGetImageInfo ( val , CL_IMAGE_HEIGHT , sizeof ( height ) , & height , NULL ) ;
clGetImageInfo ( val , CL_IMAGE_DEPTH , sizeof ( depth ) , & depth , NULL ) ;
clGetImageInfo ( val , CL_IMAGE_ARRAY_SIZE , sizeof ( array_size ) , & array_size , NULL ) ;
clGetImageInfo ( val , CL_IMAGE_ROW_PITCH , sizeof ( row_pitch ) , & row_pitch , NULL ) ;
clGetImageInfo ( val , CL_IMAGE_SLICE_PITCH , sizeof ( slice_pitch ) , & slice_pitch , NULL ) ;
assert ( depth = = 0 ) ;
assert ( array_size = = 0 ) ;
assert ( slice_pitch = = 0 ) ;
printf ( " image %zu x %zu rp %zu " , width , height , row_pitch ) ;
} else {
size_t sz ;
clGetMemObjectInfo ( val , CL_MEM_SIZE , sizeof ( sz ) , & sz , NULL ) ;
printf ( " buffer %zu " , sz ) ;
}
}
}
printf ( " \n " ) ;
}
}
cl_int ret = my_clEnqueueNDRangeKernel ( command_queue , kernel , work_dim ,
global_work_offset , global_work_size , local_work_size ,
num_events_in_wait_list , event_wait_list , event ) ;
/*uint64_t tb = nanos_since_boot();
clWaitForEvents ( 1 , event ) ;
uint64_t te = nanos_since_boot ( ) ;
if ( thneed ! = NULL & & thneed - > record & 2 ) {
printf ( " wait %lu us \n " , ( te - tb ) / 1000 ) ;
} */
return ret ;
}
//#define SAVE_KERNELS
# ifdef SAVE_KERNELS
map < cl_program , string > program_source ;
cl_program ( * my_clCreateProgramWithSource ) ( cl_context context , cl_uint count , const char * * strings , const size_t * lengths , cl_int * errcode_ret ) = NULL ;
cl_program thneed_clCreateProgramWithSource ( cl_context context , cl_uint count , const char * * strings , const size_t * lengths , cl_int * errcode_ret ) {
if ( my_clCreateProgramWithSource = = NULL ) my_clCreateProgramWithSource = reinterpret_cast < decltype ( my_clCreateProgramWithSource ) > ( dlsym ( RTLD_NEXT , " REAL_clCreateProgramWithSource " ) ) ;
assert ( count = = 1 ) ;
size_t my_lengths [ 1 ] ;
my_lengths [ 0 ] = lengths [ 0 ] ;
char fn [ 0x100 ] ;
snprintf ( fn , sizeof ( fn ) , " /tmp/program_%zu.cl " , strlen ( strings [ 0 ] ) ) ;
FILE * f = fopen ( fn , " wb " ) ;
fprintf ( f , " %s " , strings [ 0 ] ) ;
fclose ( f ) ;
char tmp [ 0x10000 ] ;
memset ( tmp , 0 , sizeof ( tmp ) ) ;
snprintf ( fn , sizeof ( fn ) , " /tmp/patched_%zu.cl " , strlen ( strings [ 0 ] ) ) ;
FILE * g = fopen ( fn , " rb " ) ;
if ( g ! = NULL ) {
printf ( " LOADING PATCHED PROGRAM %s \n " , fn ) ;
fread ( tmp , 1 , sizeof ( tmp ) , g ) ;
fclose ( g ) ;
strings [ 0 ] = tmp ;
my_lengths [ 0 ] = strlen ( tmp ) ;
}
program_source [ ret ] = strings [ 0 ] ;
cl_program ret = my_clCreateProgramWithSource ( context , count , strings , my_lengths , errcode_ret ) ;
return ret ;
}
# endif
void * dlsym ( void * handle , const char * symbol ) {
void * ( * my_dlsym ) ( void * handle , const char * symbol ) = ( void * ( * ) ( void * handle , const char * symbol ) ) ( ( uintptr_t ) dlopen - 0x2d4 ) ;
if ( memcmp ( " REAL_ " , symbol , 5 ) = = 0 ) {
return my_dlsym ( handle , symbol + 5 ) ;
} else if ( strcmp ( " clEnqueueNDRangeKernel " , symbol ) = = 0 ) {
return ( void * ) thneed_clEnqueueNDRangeKernel ;
} else if ( strcmp ( " clSetKernelArg " , symbol ) = = 0 ) {
return ( void * ) thneed_clSetKernelArg ;
# ifdef SAVE_KERNELS
} else if ( strcmp ( " clCreateProgramWithSource " , symbol ) = = 0 ) {
return ( void * ) thneed_clCreateProgramWithSource ;
# endif
} else {
return my_dlsym ( handle , symbol ) ;
}
}