//  const  __constant  float3  rgb_weights  =  ( 0.299 ,  0.587 ,  0.114 ) ; // opencv rgb2gray weights 
 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								//  const  __constant  float3  bgr_weights  =  ( 0.114 ,  0.587 ,  0.299 ) ; // bgr2gray weights 
 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							
							
								// Convert an interleaved 3-channel 8-bit image to a single channel and
								// convolve it with a square filter, one work-item per pixel.
								//
								// The kernel relies on macros supplied at build time (not defined here):
								//   IMAGE_W, IMAGE_H        image width / height in pixels
								//   HALF_FILTER_SIZE        filter radius; (2 * radius + 1)^2 taps are read
								//   TWICE_HALF_FILTER_SIZE  extra columns per cached row
								//                           (presumably 2 * HALF_FILTER_SIZE - confirm in host code)
								//   FLIP_RB                 non-zero swaps the weights of channels 0 and 2
								//
								// Parameters:
								//   input   3 bytes per pixel, pixel (x, y) starts at ( y * IMAGE_W + x ) * 3
								//   output  one short per pixel at y * IMAGE_W + x; pixels closer than
								//           HALF_FILTER_SIZE to any image edge are never written
								//   filter  taps, read row by row from index 0
								//   cached  local scratch tile; indices up to
								//           ( local_size(1) + 2 * HALF_FILTER_SIZE ) * localRowLen - 1 are used
								//
								// Every work-item of a group runs the caching phase and reaches the single
								// barrier below; only afterwards do border work-items drop out.  Each
								// work-item is assumed to sit in a group at least 2 * HALF_FILTER_SIZE wide
								// and tall, otherwise part of the halo is left unloaded.
								__kernel void rgb2gray_conv2d (
								    const __global uchar * input,
								    __global short * output,
								    __constant short * filter,
								    __local uchar3 * cached
								)
								{
								    const int gx = (int) get_global_id ( 0 ) ;
								    const int gy = (int) get_global_id ( 1 ) ;
								    const int lx = (int) get_local_id ( 0 ) ;
								    const int ly = (int) get_local_id ( 1 ) ;

								    // linear index of this work-item's pixel in the image
								    const int my = gy * IMAGE_W + gx ;

								    // the cached tile is the work-group plus a halo on every side
								    const int localRowLen = TWICE_HALF_FILTER_SIZE + (int) get_local_size ( 0 ) ;
								    const int myLocal = ( ly + HALF_FILTER_SIZE ) * localRowLen + lx + HALF_FILTER_SIZE ;

								    // Work-items near a tile edge also fetch the halo pixel that lies
								    // HALF_FILTER_SIZE further out in that direction; 0 means "no halo duty".
								    int dx = 0 ;
								    if ( lx < HALF_FILTER_SIZE )
								    {
								        dx = -HALF_FILTER_SIZE ;
								    }
								    else if ( lx >= (int) get_local_size ( 0 ) - HALF_FILTER_SIZE )
								    {
								        dx = HALF_FILTER_SIZE ;
								    }

								    int dy = 0 ;
								    if ( ly < HALF_FILTER_SIZE )
								    {
								        dy = -HALF_FILTER_SIZE ;
								    }
								    else if ( ly >= (int) get_local_size ( 1 ) - HALF_FILTER_SIZE )
								    {
								        dy = HALF_FILTER_SIZE ;
								    }

								    // Up to four loads per work-item: own pixel (k = 0), horizontal halo
								    // (k = 1), vertical halo (k = 2) and the diagonal corner (k = 3).
								    // A displacement of ( ox, oy ) in the image maps to the same
								    // displacement inside the cached tile.
								    for ( int k = 0 ; k < 4 ; ++k )
								    {
								        const int wantX = k & 1 ;
								        const int wantY = k & 2 ;
								        if ( ( wantX && dx == 0 ) || ( wantY && dy == 0 ) )
								        {
								            continue ;
								        }

								        const int ox = wantX ? dx : 0 ;
								        const int oy = wantY ? dy : 0 ;
								        const int px = gx + ox ;
								        const int py = gy + oy ;

								        // never read outside the image; such cells are only ever
								        // referenced by border pixels, which produce no output
								        if ( px < 0 || px >= IMAGE_W || py < 0 || py >= IMAGE_H )
								        {
								            continue ;
								        }

								        const int src = ( py * IMAGE_W + px ) * 3 ;
								        const int dst = myLocal + oy * localRowLen + ox ;
								        cached[ dst ].x = input[ src ] ;     // channel 0
								        cached[ dst ].y = input[ src + 1 ] ; // channel 1
								        cached[ dst ].z = input[ src + 2 ] ; // channel 2
								    }

								    // single synchronisation point, reached by every work-item of the group
								    barrier ( CLK_LOCAL_MEM_FENCE ) ;

								    // pixels without a full filter footprint inside the image are skipped
								    if ( gx < HALF_FILTER_SIZE || gx > IMAGE_W - HALF_FILTER_SIZE - 1 ||
								         gy < HALF_FILTER_SIZE || gy > IMAGE_H - HALF_FILTER_SIZE - 1 )
								    {
								        return ;
								    }

								    // perform convolution on the gray value of each cached neighbour
								    int fIndex = 0 ;
								    short sum = 0 ;

								    for ( int r = -HALF_FILTER_SIZE ; r <= HALF_FILTER_SIZE ; r++ )
								    {
								        const int rowBase = myLocal + r * localRowLen ;
								        for ( int c = -HALF_FILTER_SIZE ; c <= HALF_FILTER_SIZE ; c++, fIndex++ )
								        {
								            const uchar3 p = cached[ rowBase + c ] ;
								            // integer stand-in for a weighted channel sum: the channels
								            // are divided by 3, 2 and 9 (or 9, 2 and 3 when FLIP_RB is set)
								            if ( !FLIP_RB )
								            {
								                sum += ( p.x / 3 + p.y / 2 + p.z / 9 ) * filter[ fIndex ] ;
								            }
								            else
								            {
								                sum += ( p.x / 9 + p.y / 2 + p.z / 3 ) * filter[ fIndex ] ;
								            }
								        }
								    }

								    output[ my ] = sum ;
								}