# include  <stdio.h> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  <stdlib.h> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  <CL/cl.h> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  <assert.h> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  <time.h> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								/*
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								block7b_project_conv  ( Conv2D )    ( None ,  8 ,  16 ,  352 )    743424       block7b_activation [ 0 ] [ 0 ] 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								8448 * 8 * 4  =  8 * 16 * 2112  =  270336  =  input  =  128 * 2112 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								2112 * 88 * 4  =  743424  =  weights  =  2112 * 352 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								1408 * 8 * 4  =  8 * 16 * 352  =  45056  =  output  =  128 * 352 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								FLOPS  =  128 * 2112 * 352  =  95158272  =  95  MFLOPS 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								RAM  =  128 * 2112  +  2112 * 352  +  128 * 352  =  1058816  =  1  M  accesses 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								#  22 groups 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								128 * 2112  +  2112 * 16  +  128 * 16  =  306176 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								306176 * 22  =  6735872  real  accesses 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								This  is  a  128 x2112  by  2112 x352  matrix  multiply 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								work_size  =  { 88 ,  4 ,  8 } 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								Each  kernel  run  computes  16  outputs 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								0x7f7e8a6380                  convolution_horizontal_reduced_reads_1x1  - -    88     4     8   - -     4     4     8 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  image2d_t  input  =  0x7f7f490b00  image  8448  x  8  rp  67840 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  startPackedInputChannel  =  0 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  numPackedInputChannelsForGroup  =  528 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  totalNumPackedInputChannels  =  528 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  packedOuputChannelOffset  =  0 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  totalNumPackedOutputChannels  =  88 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  image2d_t  weights  =  0x7f7f52fb80  image  2112  x  88  rp  16896 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float *  biases  =  0x7f7f564d80  buffer  1408 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  filterSizeX  =  1 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  filterSizeY  =  1 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  image2d_t  output  =  0x7f7f490e80  image  1408  x  8  rp  11264 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  paddingX  =  0 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  paddingY  =  0 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  strideX  =  1 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  strideY  =  1 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  neuron  =  0 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float  a  =  1.000000 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float  b  =  1.000000 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float  min_clamp  =  0.000000 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float  max_clamp  =  0.000000 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float *  parameters  =  0x0 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float *  batchNormBiases  =  0x0 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  short  numOutputColumns  =  16 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								*/ 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# define GEMM 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# define IMAGE 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								void  dump_maps ( )  { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  FILE  * f  =  fopen ( " /proc/self/maps " ,  " rb " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  char  maps [ 0x100000 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  int  len  =  fread ( maps ,  1 ,  sizeof ( maps ) ,  f ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  maps [ len ]  =  ' \0 ' ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  maps [ 0x800 ]  =  ' \0 ' ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fclose ( f ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " %s \n " ,  maps ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								} 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								static  inline  uint64_t  nanos_since_boot ( )  { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  struct  timespec  t ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clock_gettime ( CLOCK_BOOTTIME ,  & t ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  return  t . tv_sec  *  1000000000ULL  +  t . tv_nsec ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								} 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								int  main ( int  argc ,  char  * argv [ ] )  { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_int  err ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  // cl init
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_device_id  device_id ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_context  context ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_command_queue  q ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    cl_platform_id  platform_id [ 2 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    cl_uint  num_devices ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    cl_uint  num_platforms ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    err  =  clGetPlatformIDs ( sizeof ( platform_id ) / sizeof ( cl_platform_id ) ,  platform_id ,  & num_platforms ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    err  =  clGetDeviceIDs ( platform_id [ 0 ] ,  CL_DEVICE_TYPE_DEFAULT ,  1 ,  & device_id ,  & num_devices ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    context  =  clCreateContext ( NULL ,  1 ,  & device_id ,  NULL ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    q  =  clCreateCommandQueue ( context ,  device_id ,  0 ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " cl ready \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  char  tmp [ 0x10000 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  memset ( tmp ,  0 ,  sizeof ( tmp ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  FILE  * f  =  fopen ( argv [ 1 ] ,  " rb " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fread ( tmp ,  1 ,  sizeof ( tmp ) ,  f ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fclose ( f ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  const  char  * strings [ 1 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  lengths [ 1 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  strings [ 0 ]  =  tmp ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  lengths [ 0 ]  =  strlen ( tmp ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_program  prog  =  clCreateProgramWithSource ( context ,  1 ,  strings ,  lengths ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " creating program \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  err  =  clBuildProgram ( prog ,  1 ,  & device_id ,  " -D AVANTE_IS_GPU_A530_64 " ,  NULL ,  NULL ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  if  ( err  ! =  0 )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    printf ( " got err %d \n " ,  err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    size_t  length ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    char  buffer [ 2048 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    clGetProgramBuildInfo ( prog ,  device_id ,  CL_PROGRAM_BUILD_LOG ,  sizeof ( buffer ) ,  buffer ,  & length ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    buffer [ length ]  =  ' \0 ' ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    printf ( " %s \n " ,  buffer ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " built program \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# ifdef GEMM 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  // 128x2112 by 2112x352
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  int  M , N , K ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  M  =  N  =  K  =  1024 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  //M = 128; K = 2112; N = 352;
  
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_kernel  kern  =  clCreateKernel ( prog ,  " gemm " ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " creating kernel %p \n " ,  kern ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_mem  A , B , C ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  A  =  clCreateBuffer ( context ,  CL_MEM_READ_WRITE ,  M * K * 2 ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  B  =  clCreateBuffer ( context ,  CL_MEM_READ_WRITE ,  K * N * 2 ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  C  =  clCreateBuffer ( context ,  CL_MEM_READ_WRITE ,  M * N * 2 ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " created buffers \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# ifdef IMAGE 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_image_format  fmt ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fmt . image_channel_order  =  CL_RGBA ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fmt . image_channel_data_type  =  CL_HALF_FLOAT ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_image_desc  desc ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_type  =  CL_MEM_OBJECT_IMAGE2D ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_depth  =  0 ;  desc . image_slice_pitch  =  0 ;  desc . num_mip_levels  =  0 ;  desc . num_samples  =  0 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_width  =  K ;  desc . image_height  =  M / 4 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . buffer  =  A ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_row_pitch  =  desc . image_width * 8 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  A  =  clCreateImage ( context ,  CL_MEM_READ_WRITE ,  & fmt ,  & desc ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_width  =  K ;  desc . image_height  =  N / 4 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . buffer  =  B ;  desc . image_row_pitch  =  desc . image_width * 8 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  B  =  clCreateImage ( context ,  CL_MEM_READ_WRITE ,  & fmt ,  & desc ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_width  =  M / 4 ;  desc . image_height  =  N ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . buffer  =  C ;  desc . image_row_pitch  =  desc . image_width * 8 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  C  =  clCreateImage ( context ,  CL_MEM_READ_WRITE ,  & fmt ,  & desc ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " created images \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# endif 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  0 ,  sizeof ( int ) ,  & M ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  1 ,  sizeof ( int ) ,  & N ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  2 ,  sizeof ( int ) ,  & K ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  3 ,  sizeof ( cl_mem ) ,  & A ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  4 ,  sizeof ( cl_mem ) ,  & B ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  5 ,  sizeof ( cl_mem ) ,  & C ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " set args \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# ifdef IMAGE 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  global_work_size [ 3 ]  =  { M / 4 ,  N / 4 ,  1 } ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  local_work_size [ 3 ]  =  { 4 ,  64 ,  1 } ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# else 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  global_work_size [ 3 ]  =  { 128 ,  128 ,  1 } ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  local_work_size [ 3 ]  =  { 2 ,  128 ,  1 } ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# endif 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# else 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_kernel  kern  =  clCreateKernel ( prog ,  " convolution_horizontal_reduced_reads_1x1 " ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " creating kernel \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_mem  input ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_mem  weights ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_mem  weights_buffer ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_mem  biases ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_mem  outputs ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_image_format  fmt ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fmt . image_channel_order  =  CL_RGBA ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fmt . image_channel_data_type  =  CL_HALF_FLOAT ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  cl_image_desc  desc ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_type  =  CL_MEM_OBJECT_IMAGE2D ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_depth  =  0 ;  desc . image_slice_pitch  =  0 ;  desc . num_mip_levels  =  0 ;  desc . num_samples  =  0 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . buffer  =  NULL ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  biases  =  clCreateBuffer ( context ,  CL_MEM_READ_WRITE ,  1408 ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_width  =  8448 ;  desc . image_height  =  8 ;  desc . image_row_pitch  =  67840 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . buffer  =  clCreateBuffer ( context ,  CL_MEM_READ_WRITE ,  desc . image_height  *  desc . image_row_pitch ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  input  =  clCreateImage ( context ,  CL_MEM_READ_WRITE ,  & fmt ,  & desc ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_width  =  2112 ;  desc . image_height  =  88 ;  desc . image_row_pitch  =  16896 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  weights_buffer  =  desc . buffer  =  clCreateBuffer ( context ,  CL_MEM_READ_WRITE ,  desc . image_height  *  desc . image_row_pitch ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  weights  =  clCreateImage ( context ,  CL_MEM_READ_WRITE ,  & fmt ,  & desc ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . image_width  =  1408 ;  desc . image_height  =  8 ;  desc . image_row_pitch  =  11264 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  desc . buffer  =  clCreateBuffer ( context ,  CL_MEM_READ_WRITE ,  desc . image_height  *  desc . image_row_pitch ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  outputs  =  clCreateImage ( context ,  CL_MEM_READ_WRITE ,  & fmt ,  & desc ,  NULL ,  & err ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  void  * n  =  NULL ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  uint16_t  v ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  float  fl ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  0 ,  sizeof ( cl_mem ) ,  & input ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  0 ;  clSetKernelArg ( kern ,  1 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  528 ;  clSetKernelArg ( kern ,  2 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  528 ;  clSetKernelArg ( kern ,  3 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  0 ;  clSetKernelArg ( kern ,  4 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  88 ;  clSetKernelArg ( kern ,  5 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  6 ,  sizeof ( cl_mem ) ,  & weights ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  //clSetKernelArg(kern, 6, sizeof(cl_mem), &weights_buffer);
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  7 ,  sizeof ( cl_mem ) ,  & biases ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  1 ;  clSetKernelArg ( kern ,  8 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  1 ;  clSetKernelArg ( kern ,  9 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  10 ,  sizeof ( cl_mem ) ,  & outputs ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  0 ;  clSetKernelArg ( kern ,  11 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  0 ;  clSetKernelArg ( kern ,  12 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  1 ;  clSetKernelArg ( kern ,  13 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  1 ;  clSetKernelArg ( kern ,  14 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  0 ;  clSetKernelArg ( kern ,  15 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fl  =  1.0 ;  clSetKernelArg ( kern ,  16 ,  sizeof ( fl ) ,  & fl ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fl  =  0.0 ;  clSetKernelArg ( kern ,  17 ,  sizeof ( fl ) ,  & fl ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fl  =  0.0 ;  clSetKernelArg ( kern ,  18 ,  sizeof ( fl ) ,  & fl ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fl  =  0.0 ;  clSetKernelArg ( kern ,  19 ,  sizeof ( fl ) ,  & fl ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  20 ,  sizeof ( n ) ,  & n ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clSetKernelArg ( kern ,  21 ,  sizeof ( n ) ,  & n ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  v  =  16 ;  clSetKernelArg ( kern ,  22 ,  sizeof ( v ) ,  & v ) ; 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  global_work_size [ 3 ]  =  { 88 ,  4 ,  8 } ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  local_work_size [ 3 ]  =  { 4 ,  4 ,  8 } ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# endif 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " ready to enqueue \n " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  for  ( int  i  =  0 ;  i  <  20 ;  i + + )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    cl_event  event ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    err  =  clEnqueueNDRangeKernel ( q ,  kern ,  3 ,  NULL ,  global_work_size ,  local_work_size ,  0 ,  NULL ,  & event ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    uint64_t  tb  =  nanos_since_boot ( ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    err  =  clWaitForEvents ( 1 ,  & event ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    uint64_t  te  =  nanos_since_boot ( ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    uint64_t  us  =  ( te - tb ) / 1000 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    float  s  =  1000000.0 / us ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# ifdef GEMM 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    float  flops  =  M * N * K * s ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    float  rams  =  ( M * N  +  N * K  +  M * K ) * s ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# else 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    float  flops  =  95158272.0 * s ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    float  rams  =  1058816.0 * s ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    //float rams = 6735872.0*s;
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# endif 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    printf ( " %2d: wait %lu us -- %.2f GFLOPS -- %.2f GB/s \n " ,  i ,  us ,  flops / 1e9 ,  rams * 2 / 1e9 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  binary_size  =  0 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  err  =  clGetProgramInfo ( prog ,  CL_PROGRAM_BINARY_SIZES ,  sizeof ( binary_size ) ,  & binary_size ,  NULL ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( binary_size  >  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  uint8_t  * binary_buf  =  ( uint8_t  * ) malloc ( binary_size ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( binary_buf ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  uint8_t *  bufs [ 1 ]  =  {  binary_buf ,  } ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  err  =  clGetProgramInfo ( prog ,  CL_PROGRAM_BINARIES ,  sizeof ( bufs ) ,  & bufs ,  NULL ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  assert ( err  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  FILE  * g  =  fopen ( " /tmp/bin.bin " ,  " wb " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fwrite ( binary_buf ,  1 ,  binary_size ,  g ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  fclose ( g ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  /*dump_maps();
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  for  ( uint64_t  i  =  0x7ffbd2000 ;  i  <  0x800000000 ;  i  + =  0x1000 )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    uint64_t  cmd  =  * ( ( uint64_t * ) i ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    printf ( " %llx: %llx \n " ,  i ,  cmd ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } */ 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  return  0 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								}