# pragma once
# ifndef __user
# define __user __attribute__(())
# endif
# include <cstdint>
# include <cstdlib>
# include <memory>
# include <string>
# include <vector>
# include <CL/cl.h>
# include "msm_kgsl.h"
using namespace std ;
cl_int thneed_clSetKernelArg ( cl_kernel kernel , cl_uint arg_index , size_t arg_size , const void * arg_value ) ;
namespace json11 {
class Json ;
}
class Thneed ;
class GPUMalloc {
public :
GPUMalloc ( int size , int fd ) ;
~ GPUMalloc ( ) ;
void * alloc ( int size ) ;
private :
uint64_t base ;
int remaining ;
} ;
class CLQueuedKernel {
public :
CLQueuedKernel ( Thneed * lthneed ) { thneed = lthneed ; }
CLQueuedKernel ( Thneed * lthneed ,
cl_kernel _kernel ,
cl_uint _work_dim ,
const size_t * _global_work_size ,
const size_t * _local_work_size ) ;
cl_int exec ( ) ;
void debug_print ( bool verbose ) ;
int get_arg_num ( const char * search_arg_name ) ;
cl_program program ;
string name ;
cl_uint num_args ;
vector < string > arg_names ;
vector < string > arg_types ;
vector < string > args ;
vector < int > args_size ;
cl_kernel kernel = NULL ;
json11 : : Json to_json ( ) const ;
cl_uint work_dim ;
size_t global_work_size [ 3 ] = { 0 } ;
size_t local_work_size [ 3 ] = { 0 } ;
private :
Thneed * thneed ;
} ;
class CachedIoctl {
public :
virtual void exec ( ) { }
} ;
class CachedSync : public CachedIoctl {
public :
CachedSync ( Thneed * lthneed , string ldata ) { thneed = lthneed ; data = ldata ; }
void exec ( ) ;
private :
Thneed * thneed ;
string data ;
} ;
class CachedCommand : public CachedIoctl {
public :
CachedCommand ( Thneed * lthneed , struct kgsl_gpu_command * cmd ) ;
void exec ( ) ;
private :
void disassemble ( int cmd_index ) ;
struct kgsl_gpu_command cache ;
unique_ptr < kgsl_command_object [ ] > cmds ;
unique_ptr < kgsl_command_object [ ] > objs ;
Thneed * thneed ;
vector < shared_ptr < CLQueuedKernel > > kq ;
} ;
class Thneed {
public :
Thneed ( bool do_clinit = false , cl_context _context = NULL ) ;
void stop ( ) ;
void execute ( float * * finputs , float * foutput , bool slow = false ) ;
void wait ( ) ;
vector < cl_mem > input_clmem ;
vector < void * > inputs ;
vector < size_t > input_sizes ;
cl_mem output = NULL ;
cl_context context = NULL ;
cl_command_queue command_queue ;
cl_device_id device_id ;
int context_id ;
// protected?
bool record = false ;
int debug ;
int timestamp ;
# ifdef QCOM2
unique_ptr < GPUMalloc > ram ;
vector < unique_ptr < CachedIoctl > > cmds ;
int fd ;
# endif
// all CL kernels
void copy_inputs ( float * * finputs , bool internal = false ) ;
void copy_output ( float * foutput ) ;
cl_int clexec ( ) ;
vector < shared_ptr < CLQueuedKernel > > kq ;
// pending CL kernels
vector < shared_ptr < CLQueuedKernel > > ckq ;
// loading
void load ( const char * filename ) ;
private :
void clinit ( ) ;
} ;