# include  <map> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  <string> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  <string.h> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  <assert.h> 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  "thneed.h" 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								# include  "common/util.h" 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# include  "common/clutil.h" 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								// Source text of OpenCL programs, keyed by their cl_program handle.
								// Only declared here (extern); the definition is not in this part of the file.
								// Thneed::optimize() stores the source of every replacement kernel it builds
								// into this map.
								extern  map < cl_program ,  string >  g_program_source ; 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								// Returns 1 when images `a` and `b` report identical width, height, depth,
								// array size, row pitch and slice pitch; returns 0 otherwise.
								//
								// Every clGetImageInfo() call is checked. If any query fails the images are
								// treated as NOT the same size (0), so the caller never acts on values that
								// were not actually written by the OpenCL runtime.
								static int is_same_size_image(cl_mem a, cl_mem b) {
								  // All of these queries return a size_t.
								  static const cl_image_info fields[] = {
								    CL_IMAGE_WIDTH,
								    CL_IMAGE_HEIGHT,
								    CL_IMAGE_DEPTH,
								    CL_IMAGE_ARRAY_SIZE,
								    CL_IMAGE_ROW_PITCH,
								    CL_IMAGE_SLICE_PITCH,
								  };

								  for (cl_image_info field : fields) {
								    size_t a_val = 0;
								    size_t b_val = 0;

								    if (clGetImageInfo(a, field, sizeof(a_val), &a_val, NULL) != CL_SUCCESS) {
								      return 0;
								    }
								    if (clGetImageInfo(b, field, sizeof(b_val), &b_val, NULL) != CL_SUCCESS) {
								      return 0;
								    }

								    // first mismatch decides the answer, no need to query the rest
								    if (a_val != b_val) {
								      return 0;
								    }
								  }

								  return 1;
								}
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								// Creates a new read-write 2D image with the same format, width, height and
								// row pitch as `val`, backed by a newly created read-write buffer of
								// row_pitch * height bytes.
								//
								// Only CL_RGBA / CL_HALF_FLOAT images are accepted (asserted).
								// Every OpenCL call is checked with assert(), matching the rest of this file;
								// the outputs are zero-initialized so nothing indeterminate is read even when
								// asserts are compiled out.
								//
								// Returns the new image. The backing buffer is not released here.
								// NOTE(review): confirm who is expected to release the buffer and the image.
								static cl_mem make_image_like(cl_context context, cl_mem val) {
								  cl_int err;

								  cl_image_format format = {0};
								  err = clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
								  assert(err == CL_SUCCESS);
								  assert(format.image_channel_order == CL_RGBA);
								  assert(format.image_channel_data_type == CL_HALF_FLOAT);

								  size_t width = 0, height = 0, row_pitch = 0;
								  err = clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
								  assert(err == CL_SUCCESS);
								  err = clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
								  assert(err == CL_SUCCESS);
								  err = clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);
								  assert(err == CL_SUCCESS);

								  cl_image_desc desc = {0};
								  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
								  desc.image_width = width;
								  desc.image_height = height;
								  desc.image_row_pitch = row_pitch;

								  // storage for the new image: one buffer covering every row at the source pitch
								  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, &err);
								  assert(buf != NULL && err == CL_SUCCESS);
								  desc.buffer = buf;

								  cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
								  assert(tmp != NULL && err == CL_SUCCESS);

								  return tmp;
								}
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								// make that faster and the model gets faster
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								// this cuts ~2 ms off the model runtime right now
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								int  Thneed : : optimize ( )  { 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  const  char  * kernel_path  =  getenv ( " KERNEL_PATH " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  if  ( ! kernel_path )  {  kernel_path  =  " /data/openpilot/selfdrive/modeld/thneed/kernels " ;  printf ( " no KERNEL_PATH set, defaulting to %s \n " ,  kernel_path ) ;  } 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  string  convolution_ ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    char  fn [ 0x100 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    snprintf ( fn ,  sizeof ( fn ) ,  " %s/%s.cl " ,  kernel_path ,  " convolution_ " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    convolution_  =  util : : read_file ( fn ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  // load custom kernels
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  map < string ,  cl_program >  g_programs ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  for  ( auto  & k  :  kq )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    // replace program?
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    if  ( g_programs . find ( k - > name )  = =  g_programs . end ( ) )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      char  fn [ 0x100 ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      snprintf ( fn ,  sizeof ( fn ) ,  " %s/%s.cl " ,  kernel_path ,  k - > name . c_str ( ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								      if  ( util : : file_exists ( fn ) )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        string  kernel_src  =  util : : read_file ( fn ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        if  ( k - > name . rfind ( " convolution_ " ,  0 )  = =  0 )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          kernel_src  + =  convolution_ ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        } 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								        printf ( " building kernel %s with len %lu \n " ,  k - > name . c_str ( ) ,  kernel_src . length ( ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        k - > program  =  cl_program_from_source ( context ,  device_id ,  kernel_src ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        // save in cache
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        g_programs [ k - > name ]  =  k - > program ; 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								        g_program_source [ k - > program ]  =  kernel_src ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      }  else  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        g_programs [ k - > name ]  =  NULL ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    }  else  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // cached replacement
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      if  ( g_programs [ k - > name ]  ! =  NULL )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        k - > program  =  g_programs [ k - > name ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    // hack in accumulator to convolution_horizontal_reduced_reads_1x1
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    if  ( k - > name  = =  " convolution_horizontal_reduced_reads_1x1 " )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      k - > arg_names . push_back ( " doAccumulate " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      short  doAccumulate  =  0 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      k - > args . push_back ( string ( ( char  * ) & doAccumulate ,  sizeof ( doAccumulate ) ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      k - > args_size . push_back ( 2 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      k - > arg_names . push_back ( " accumulator " ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      k - > args . push_back ( k - > args [ k - > get_arg_num ( " output " ) ] ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      k - > args_size . push_back ( 8 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      k - > num_args  + =  2 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    // assert that parameters + batchNormBiases are not used
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    // since they aren't supported in custom replacement kernels
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    if  ( k - > name  = =  " convolution_horizontal_reduced_reads_1x1 "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        k - > name  = =  " convolution_horizontal_reduced_reads "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        k - > name  = =  " convolution_horizontal_reduced_reads_5_outputs " )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      string  p1  =  k - > args [ k - > get_arg_num ( " parameters " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      string  p2  =  k - > args [ k - > get_arg_num ( " batchNormBiases " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      assert ( p1 . length ( )  = =  8  & &  * ( ( uint64_t * ) p1 . data ( ) )  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      assert ( p2 . length ( )  = =  8  & &  * ( ( uint64_t * ) p2 . data ( ) )  = =  0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  // optimizer
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  start_size ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  do  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    start_size  =  kq . size ( ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    // get optimizations
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    map < string ,  string >  replacements ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    for  ( int  i  =  0 ;  i  <  kq . size ( ) ;  i + + )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // fusing elementwise_sum + activate_image will save 3 enqueues
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // delete useless copy layers
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // saves ~0.7 ms
  
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								      if  ( kq [ i ] - > name  = =  " concatenation "  | |  kq [ i ] - > name  = =  " flatten " )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        string  in  =  kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " input " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        string  out  =  kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " output " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        if  ( is_same_size_image ( * ( cl_mem * ) in . data ( ) ,  * ( cl_mem * ) out . data ( ) ) )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          cl_mem  tmp  =  make_image_like ( context ,  * ( cl_mem  * ) in . data ( ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          replacements [ in ]  =  string ( ( char  * ) & tmp ,  sizeof ( tmp ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          replacements [ out ]  =  string ( ( char  * ) & tmp ,  sizeof ( tmp ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          kq . erase ( kq . begin ( ) + i ) ;  - - i ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        } 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								      } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // NOTE: if activations/accumulation are done in the wrong order, this will be wrong
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // fuse activations into convs and fc_Wtx
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // saves ~1.5 ms
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // NOTE: this changes the outputs because of rounding, should be better now!
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      if  ( i  ! =  0  & &  kq [ i ] - > name  = =  " activate_image " )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        if  ( kq [ i - 1 ] - > name  = =  " convolution_horizontal_reduced_reads_1x1 "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > name  = =  " convolution_horizontal_reduced_reads_5_outputs "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > name  = =  " convolution_horizontal_reduced_reads "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > name  = =  " convolution_horizontal_reduced_reads_depthwise "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > name  = =  " convolution_horizontal_reduced_reads_depthwise_stride_1 "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > name  = =  " fc_Wtx " )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          string  lastout  =  kq [ i - 1 ] - > args [ kq [ i - 1 ] - > get_arg_num ( " output " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          string  in  =  kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " input " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          string  out  =  kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " output " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          if  ( lastout  = =  in )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            short  neuron  =  * ( int * ) kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " neuron " ) ] . data ( ) ; 
 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								            assert ( neuron  < =  5 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            // ELU isn't supported in fc_Wtx
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            assert ( ! ( kq [ i - 1 ] - > name  = =  " fc_Wtx "  & &  neuron  = =  5 ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > args [ kq [ i - 1 ] - > get_arg_num ( " neuron " ) ]  =  string ( ( char  * ) & neuron ,  sizeof ( neuron ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            cl_mem  tmp  =  make_image_like ( context ,  * ( cl_mem  * ) lastout . data ( ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            replacements [ in ]  =  string ( ( char  * ) & tmp ,  sizeof ( tmp ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            replacements [ out ]  =  string ( ( char  * ) & tmp ,  sizeof ( tmp ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq . erase ( kq . begin ( ) + i ) ;  - - i ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      // fuse accumulation into convs and fc_Wtx
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      if  ( i  ! =  0  & &  kq [ i ] - > name  = =  " elementwise_sum " )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        if  ( kq [ i - 1 ] - > name  = =  " convolution_horizontal_reduced_reads_1x1 "  | | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > name  = =  " fc_Wtx " )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          string  lastout  =  kq [ i - 1 ] - > args [ kq [ i - 1 ] - > get_arg_num ( " output " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          string  a  =  kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " a " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          string  b  =  kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " b " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          string  out  =  kq [ i ] - > args [ kq [ i ] - > get_arg_num ( " output " ) ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          if  ( lastout  = =  a )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > args [ kq [ i - 1 ] - > get_arg_num ( " accumulator " ) ]  =  b ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          }  else  if  ( lastout  = =  b )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            kq [ i - 1 ] - > args [ kq [ i - 1 ] - > get_arg_num ( " accumulator " ) ]  =  a ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          }  else  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            continue ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          cl_mem  tmp  =  make_image_like ( context ,  * ( cl_mem  * ) lastout . data ( ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          replacements [ lastout ]  =  string ( ( char  * ) & tmp ,  sizeof ( tmp ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          replacements [ out ]  =  string ( ( char  * ) & tmp ,  sizeof ( tmp ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          short  doAccumulate  =  1 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          kq [ i - 1 ] - > args [ kq [ i - 1 ] - > get_arg_num ( " doAccumulate " ) ]  =  string ( ( char  * ) & doAccumulate ,  sizeof ( doAccumulate ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          kq . erase ( kq . begin ( ) + i ) ;  - - i ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    // remap inputs and outputs, and clear the kernels
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    for  ( int  i  =  0 ;  i  <  kq . size ( ) ;  i + + )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      kq [ i ] - > kernel  =  NULL ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      for  ( int  j  =  0 ;  j  <  kq [ i ] - > num_args ;  j + + )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        if  ( replacements . find ( kq [ i ] - > args [ j ] )  ! =  replacements . end ( ) )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          kq [ i ] - > args [ j ]  =  replacements [ kq [ i ] - > args [ j ] ] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    printf ( " optimize %lu -> %lu \n " ,  start_size ,  kq . size ( ) ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  }  while  ( kq . size ( )  ! =  start_size ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  size_t  work_group_size  =  0 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  clGetDeviceInfo ( device_id ,  CL_DEVICE_MAX_WORK_GROUP_SIZE ,  sizeof ( work_group_size ) ,  & work_group_size ,  NULL ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  printf ( " max work group size %lu \n " ,  work_group_size ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  // local work group optimizer
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  for  ( auto  & k  :  kq )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    // only do it for convs, since others might share memory
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    if  ( k - > name . rfind ( " convolution_ " ,  0 )  = =  0 )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      int  best  =  - 1 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      if  ( k - > local_work_size [ 0 ]  *  k - > local_work_size [ 1 ]  *  k - > local_work_size [ 2 ]  <  work_group_size / 2 )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        uint64_t  base_time  =  k - > benchmark ( ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        uint64_t  best_time  =  base_time ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        for  ( int  i  =  0 ;  i  <  3 ;  i + + )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          k - > local_work_size [ i ]  * =  2 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          uint64_t  this_time  =  k - > benchmark ( ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          if  ( this_time  <  best_time )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            best  =  i ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								            best_time  =  this_time ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          k - > local_work_size [ i ]  / =  2 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        if  ( best  ! =  - 1 )  { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          k - > local_work_size [ best ]  * =  2 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								          //printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
  
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								        } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  return  0 ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								}