@ -1,12 +1,3 @@ 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# ifdef  HALF_AS_FLOAT  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  half  float  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  half2  float2  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  half3  float3  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  half4  float4  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# else  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# pragma  OPENCL  EXTENSION  cl_khr_fp16  :  enable  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# endif  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  UV_WIDTH  RGB_WIDTH  /  2  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  UV_HEIGHT  RGB_HEIGHT  /  2  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  U_OFFSET  RGB_WIDTH  *  RGB_HEIGHT  
				
			 
			
		
	
	
		
			
				
					
						
						
						
							
								 
							 
						
					 
				
				 
				 
				
					@ -17,171 +8,129 @@ 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  RGB_TO_V ( r,  g,  b )  ( ( mul24 ( r,  56 )  -  mul24 ( g,  47 )  -  mul24 ( b,  9 )  +  0x8080 )  >>  8 )  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					# define  AVERAGE ( x,  y,  z,  w )  ( ( convert_ushort ( x )  +  convert_ushort ( y )  +  convert_ushort ( z )  +  convert_ushort ( w )  +  1 )  >>  1 )  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					//  post  wb  CCM  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					const  __constant  half3  color_correction_0  =  ( half3 ) ( 1.82717181 ,  -0.31231438 ,  0.07307673 ) ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					const  __constant  half3  color_correction_1  =  ( half3 ) ( -0.5743977 ,  1.36858544 ,  -0.53183455 ) ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					const  __constant  half3  color_correction_2  =  ( half3 ) ( -0.25277411 ,  -0.05627105 ,  1.45875782 ) ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					float3  color_correct ( float3  rgb )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  color  correction   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  float3  x  =  rgb.x  *  ( float3 ) ( 1.82717181 ,  -0.31231438 ,  0.07307673 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  x  +=  rgb.y  *  ( float3 ) ( -0.5743977 ,  1.36858544 ,  -0.53183455 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  x  +=  rgb.z  *  ( float3 ) ( -0.25277411 ,  -0.05627105 ,  1.45875782 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					//  tone  mapping  params  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					const  half   gamma_k  =  0.75 ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					const  half   gamma_b  =  0.125 ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					const  half   mp  =  0.01 ; // ideally midpoint should be adaptive  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					const  half   rk  =  9  -  100*mp ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					   //  tone  mapping  params  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   gamma_k  =  0.75 ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   gamma_b  =  0.125 ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   mp  =  0.01 ; // ideally midpoint should be adaptive  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   rk  =  9  -  100*mp ;  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					inline  half3  gamma_apply ( half3  x )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  poly  approximation  for  s  curve   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  return  ( x  >  mp )  ?   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    ( ( rk  *  ( x-mp )  *  ( 1- ( gamma_k*mp+gamma_b ) )  *  ( 1+1/ ( rk* ( 1-mp ) ) )  /  ( 1+rk* ( x-mp ) ) )  +  gamma_k*mp  +  gamma_b )  :   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    ( ( rk  *  ( x-mp )  *  ( gamma_k*mp+gamma_b )  *  ( 1+1/ ( rk*mp ) )  /  ( 1-rk* ( x-mp ) ) )  +  gamma_k*mp  +  gamma_b ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					}  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					inline  half3  color_correct ( half3  rgb )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  half3  ret  =  ( half ) rgb.x  *  color_correction_0 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  ret  +=  ( half ) rgb.y  *  color_correction_1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  ret  +=  ( half ) rgb.z  *  color_correction_2 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  return  gamma_apply ( ret ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					}  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					inline  half  get_vignetting_s ( float  r )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					float  get_vignetting_s ( float  r )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  if  ( r  <  62500 )  {   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( half ) ( 1.0f  +  0.0000008f*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( 1.0f  +  0.0000008f*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }  else  if  ( r  <  490000 )  {   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( half ) ( 0.9625f  +  0.0000014f*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( 0.9625f  +  0.0000014f*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }  else  if  ( r  <  1102500 )  {   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( half ) ( 1.26434f  +  0.0000000000016f*r*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( 1.26434f  +  0.0000000000016f*r*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }  else  {   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( half ) ( 0.53503625f  +  0.0000000000022f*r*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    return  ( 0.53503625f  +  0.0000000000022f*r*r ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					}  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					inline  half  val_from_10 ( const  uchar  *  source,  int  gx,  int  gy,  half  black_level )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  parse  12bit   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  int  start  =  gy  *  FRAME_STRIDE  +  ( 3  *  ( gx  /  2 ) )  +  ( FRAME_STRIDE  *  FRAME_OFFSET ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  int  offset  =  gx  %  2 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  uint  major  =  ( uint ) source[start  +  offset]  <<  4 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  uint  minor  =  ( source[start  +  2]  >>  ( 4  *  offset ) )  &  0xf ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  half  pv  =  ( ( half ) ( major  +  minor ) )  /  4.0 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  normalize   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  pv  =  max ( ( half ) 0.0 ,  pv  -  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  pv  /=  ( 1024.0  -  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  correct  vignetting   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  if  ( CAM_NUM  ==  1 )  {  //  fcamera   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    gx  =  ( gx  -  RGB_WIDTH/2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    gy  =  ( gy  -  RGB_HEIGHT/2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    pv  *=  get_vignetting_s ( gx*gx  +  gy*gy ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  pv  =  clamp ( pv,  ( half ) 0.0 ,  ( half ) 1.0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  return  pv ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					float4  val4_from_12 ( uchar8  pvs,  float  gain )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  uint4  parsed  =  ( uint4 ) ( ( ( uint ) pvs.s0<<4 )  +  ( pvs.s1>>4 ) ,   //  is  from  the  previous  10  bit   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					                         ( ( uint ) pvs.s2<<4 )  +  ( pvs.s4&0xF ) ,   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					                         ( ( uint ) pvs.s3<<4 )  +  ( pvs.s4>>4 ) ,   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					                         ( ( uint ) pvs.s5<<4 )  +  ( pvs.s7&0xF ) ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  normalize  and  scale   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  float4  pv  =  ( convert_float4 ( parsed )  -  168.0 )  /  ( 4096.0  -  168.0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  return  clamp ( pv*gain,  0.0 ,  1.0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					}  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					inline  half  get_k ( half  a,  half  b,  half  c,  half   d )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					float  get_k ( float  a,  float  b,  float  c,  float  d )  {  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  return  2.0  -  ( fabs ( a  -  b )  +  fabs ( c  -  d ) ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					}  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					__kernel  void  debayer10 ( const  __global  uchar  *  in,  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					                        __global  uchar  *  out,   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					                        __local  half  *  cached,   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					                        float  black_level   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					                       )   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					__kernel  void  debayer10 ( const  __global  uchar  *  in,  __global  uchar  *  out )  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					{  
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  gid_x  =  get_global_id ( 0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  gid_y  =  get_global_id ( 1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  lid_x  =  get_local_id ( 0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  lid_y  =  get_local_id ( 1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  localRowLen  =  mad24 ( get_local_size ( 0 ) ,  2 ,  2 ) ; // 2 padding   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  localColLen  =  mad24 ( get_local_size ( 1 ) ,  2 ,  2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  x_global  =  mul24 ( gid_x,  2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  y_global  =  mul24 ( gid_y,  2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  y_top_mod  =  ( gid_y  ==  0 )  ?  2:  0 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  y_bot_mod  =  ( gid_y  ==  ( RGB_HEIGHT/2  -  1 ) )  ?  1:  3 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  x_local  =  mad24 ( lid_x,  2 ,  1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  y_local  =  mad24 ( lid_y,  2 ,  1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  x_global_mod  =  ( gid_x  ==  0  | |  gid_x  ==  get_global_size ( 0 )  -  1 )  ?  -1:  1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  int  y_global_mod  =  ( gid_y  ==  0  | |  gid_y  ==  get_global_size ( 1 )  -  1 )  ?  -1:  1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  float3  rgb ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  uchar3  rgb_out[4] ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  int  localColOffset  =  0 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  int  globalColOffset ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  int  start  =  ( 2  *  gid_y  -  1 )  *  FRAME_STRIDE  +  ( 3  *  gid_x  -  2 )  +  ( FRAME_STRIDE  *  FRAME_OFFSET ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  +  0 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  +  1 ) ]  =  val_from_10 ( in,  x_global  +  1 ,  y_global  +  0 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  cached[mad24 ( y_local  +  1 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  +  1 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  cached[mad24 ( y_local  +  1 ,  localRowLen,  x_local  +  1 ) ]  =  val_from_10 ( in,  x_global  +  1 ,  y_global  +  1 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  read  in  8x4  chars   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  uchar8  dat[4] ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  dat[0]  =  vload8 ( 0 ,  in  +  start  +  FRAME_STRIDE*y_top_mod ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  dat[1]  =  vload8 ( 0 ,  in  +  start  +  FRAME_STRIDE*1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  dat[2]  =  vload8 ( 0 ,  in  +  start  +  FRAME_STRIDE*2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  dat[3]  =  vload8 ( 0 ,  in  +  start  +  FRAME_STRIDE*y_bot_mod ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  if  ( lid_x  ==  0 )  {   //  left  edge   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    localColOffset  =  -1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    globalColOffset  =  -x_global_mod ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  -  1 ) ]  =  val_from_10 ( in,  x_global  -  x_global_mod,  y_global  +  0 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  +  1 ,  localRowLen,  x_local  -  1 ) ]  =  val_from_10 ( in,  x_global  -  x_global_mod,  y_global  +  1 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }  else  if  ( lid_x  ==  get_local_size ( 0 )  -  1 )  {   //  right  edge   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    localColOffset  =  2 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    globalColOffset  =  x_global_mod  +  1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  +  2 ) ]  =  val_from_10 ( in,  x_global  +  x_global_mod  +  1 ,  y_global  +  0 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  +  1 ,  localRowLen,  x_local  +  2 ) ]  =  val_from_10 ( in,  x_global  +  x_global_mod  +  1 ,  y_global  +  1 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  if  ( lid_y  ==  0 )  {   //  top  row   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  -  1 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  -  y_global_mod,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  -  1 ,  localRowLen,  x_local  +  1 ) ]  =  val_from_10 ( in,  x_global  +  1 ,  y_global  -  y_global_mod,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    if  ( localColOffset  !=  0 )  {   //  cache  corners   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					      cached[mad24 ( y_local  -  1 ,  localRowLen,  x_local  +  localColOffset ) ]  =  val_from_10 ( in,  x_global  +  globalColOffset,  y_global  -  y_global_mod,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    }   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }  else  if  ( lid_y  ==  get_local_size ( 1 )  -  1 )  {   //  bottom  row   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  +  2 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  +  y_global_mod  +  1 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    cached[mad24 ( y_local  +  2 ,  localRowLen,  x_local  +  1 ) ]  =  val_from_10 ( in,  x_global  +  1 ,  y_global  +  y_global_mod  +  1 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    if  ( localColOffset  !=  0 )  {   //  cache  corners   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					      cached[mad24 ( y_local  +  2 ,  localRowLen,  x_local  +  localColOffset ) ]  =  val_from_10 ( in,  x_global  +  globalColOffset,  y_global  +  y_global_mod  +  1 ,  black_level ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    }   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  correct  vignetting   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  # if  VIGNETTING   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    int  gx  =  ( gid_x*2  -  RGB_WIDTH/2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    int  gy  =  ( gid_y*2  -  RGB_HEIGHT/2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    const  float  gain  =  get_vignetting_s ( gx*gx  +  gy*gy ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  # else   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    const  float  gain  =  1.0 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  # endif   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  process  them  to  floats   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  float4  va  =  val4_from_12 ( dat[0],  gain ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  float4  vb  =  val4_from_12 ( dat[1],  gain ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  float4  vc  =  val4_from_12 ( dat[2],  gain ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  float4  vd  =  val4_from_12 ( dat[3],  gain ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  if  ( gid_x  ==  0 )  {   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    va.s0  =  va.s2 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    vb.s0  =  vb.s2 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    vc.s0  =  vc.s2 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    vd.s0  =  vd.s2 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }  else  if  ( gid_x  ==  RGB_WIDTH/2  -  1 )  {   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    va.s3  =  va.s1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    vb.s3  =  vb.s1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    vc.s3  =  vc.s1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					    vd.s3  =  vd.s1 ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  }   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  sync   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  barrier ( CLK_LOCAL_MEM_FENCE ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  half3  rgb ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  uchar3  rgb_out[4] ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half4  va  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  0 ,  localRowLen,  lid_x  *  2 ) ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half4  vb  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  1 ,  localRowLen,  lid_x  *  2 ) ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half4  vc  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  2 ,  localRowLen,  lid_x  *  2 ) ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half4  vd  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  3 ,  localRowLen,  lid_x  *  2 ) ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  //  a  simplified  version  of  https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k01  =  get_k ( va.s0,  vb.s1,  va.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k02  =  get_k ( va.s2,  vb.s1,  vc.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k03  =  get_k ( vc.s0,  vb.s1,  vc.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k04  =  get_k ( va.s0,  vb.s1,  vc.s0,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float  k01  =  get_k ( va.s0,  vb.s1,  va.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float  k02  =  get_k ( va.s2,  vb.s1,  vc.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float  k03  =  get_k ( vc.s0,  vb.s1,  vc.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float  k04  =  get_k ( va.s0,  vb.s1,  vc.s0,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.x  =  ( k02*vb.s2+k04*vb.s0 ) / ( k02+k04 ) ; // R_G1   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.y  =  vb.s1 ; // G1(R)   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.z  =  ( k01*va.s1+k03*vc.s1 ) / ( k01+k03 ) ; // B_G1   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb_out[0]  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  0.0 ,  1.0 ) )  *  255.0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k11  =  get_k ( va.s1,  vc.s1,  va.s3,  vc.s3 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k12  =  get_k ( va.s2,  vb.s1,  vb.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k13  =  get_k ( va.s1,  va.s3,  vc.s1,  vc.s3 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k14  =  get_k ( va.s2,  vb.s3,  vc.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float  k11  =  get_k ( va.s1,  vc.s1,  va.s3,  vc.s3 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k12  =  get_k ( va.s2,  vb.s1,  vb.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k13  =  get_k ( va.s1,  va.s3,  vc.s1,  vc.s3 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k14  =  get_k ( va.s2,  vb.s3,  vc.s2,  vb.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.x  =  vb.s2 ; // R   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.y  =  ( k11* ( va.s2+vc.s2 ) *0.5+k13* ( vb.s3+vb.s1 ) *0.5 ) / ( k11+k13 ) ; // G_R   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.z  =  ( k12* ( va.s3+vc.s1 ) *0.5+k14* ( va.s1+vc.s3 ) *0.5 ) / ( k12+k14 ) ; // B_R   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb_out[1]  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  0.0 ,  1.0 ) )  *  255.0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k21  =  get_k ( vb.s0,  vd.s0,  vb.s2,  vd.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k22  =  get_k ( vb.s1,  vc.s0,  vc.s2,  vd.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k23  =  get_k ( vb.s0,  vb.s2,  vd.s0,  vd.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k24  =  get_k ( vb.s1,  vc.s2,  vd.s1,  vc.s0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k21  =  get_k ( vb.s0,  vd.s0,  vb.s2,  vd.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k22  =  get_k ( vb.s1,  vc.s0,  vc.s2,  vd.s1 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k23  =  get_k ( vb.s0,  vb.s2,  vd.s0,  vd.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k24  =  get_k ( vb.s1,  vc.s2,  vd.s1,  vc.s0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.x  =  ( k22* ( vb.s2+vd.s0 ) *0.5+k24* ( vb.s0+vd.s2 ) *0.5 ) / ( k22+k24 ) ; // R_B   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.y  =  ( k21* ( vb.s1+vd.s1 ) *0.5+k23* ( vc.s2+vc.s0 ) *0.5 ) / ( k21+k23 ) ; // G_B   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.z  =  vc.s1 ; // B   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb_out[2]  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  0.0 ,  1.0 ) )  *  255.0 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					
 
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k31  =  get_k ( vb.s1,  vc.s2,  vb.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k32  =  get_k ( vb.s3,  vc.s2,  vd.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k33  =  get_k ( vd.s1,  vc.s2,  vd.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  half   k34  =  get_k ( vb.s1,  vc.s2,  vd.s1,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k31  =  get_k ( vb.s1,  vc.s2,  vb.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k32  =  get_k ( vb.s3,  vc.s2,  vd.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k33  =  get_k ( vd.s1,  vc.s2,  vd.s3,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  const  float   k34  =  get_k ( vb.s1,  vc.s2,  vd.s1,  vc.s2 ) ;   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.x  =  ( k31*vb.s2+k33*vd.s2 ) / ( k31+k33 ) ; // R_G2   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.y  =  vc.s2 ; // G2(B)   
				
			 
			
		
	
		
			
				
					 
					 
				
				 
				 
				
					  rgb.z  =  ( k32*vc.s3+k34*vc.s1 ) / ( k32+k34 ) ; // B_G2