@ -1,10 +1,22 @@ 
			
		
	
		
			
				
					# ifdef  HALF_AS_FLOAT  
			
		
	
		
			
				
					# define  half  float  
			
		
	
		
			
				
					# define  half2  float2  
			
		
	
		
			
				
					# define  half3  float3  
			
		
	
		
			
				
					# define  half4  float4  
			
		
	
		
			
				
					# else  
			
		
	
		
			
				
					# pragma  OPENCL  EXTENSION  cl_khr_fp16  :  enable  
			
		
	
		
			
				
					# endif  
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					# define  UV_WIDTH  RGB_WIDTH  /  2  
			
		
	
		
			
				
					# define  UV_HEIGHT  RGB_HEIGHT  /  2  
			
		
	
		
			
				
					# define  U_OFFSET  RGB_WIDTH  *  RGB_HEIGHT  
			
		
	
		
			
				
					# define  V_OFFSET  RGB_WIDTH  *  RGB_HEIGHT  +  UV_WIDTH  *  UV_HEIGHT  
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					# define  RGB_TO_Y ( r,  g,  b )  ( ( ( ( mul24 ( b,  13 )  +  mul24 ( g,  65 )  +  mul24 ( r,  33 ) )  +  64 )  >>  7 )  +  16 )  
			
		
	
		
			
				
					# define  RGB_TO_U ( r,  g,  b )  ( ( mul24 ( b,  56 )  -  mul24 ( g,  37 )  -  mul24 ( r,  19 )  +  0x8080 )  >>  8 )  
			
		
	
		
			
				
					# define  RGB_TO_V ( r,  g,  b )  ( ( mul24 ( r,  56 )  -  mul24 ( g,  47 )  -  mul24 ( b,  9 )  +  0x8080 )  >>  8 )  
			
		
	
		
			
				
					# define  AVERAGE ( x,  y,  z,  w )  ( ( convert_ushort ( x )  +  convert_ushort ( y )  +  convert_ushort ( z )  +  convert_ushort ( w )  +  1 )  >>  1 )  
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					//  post  wb  CCM  
			
		
	
		
			
				
					const  __constant  half3  color_correction_0  =  ( half3 ) ( 1.82717181 ,  -0.31231438 ,  0.07307673 ) ;  
			
		
	
		
			
				
					const  __constant  half3  color_correction_1  =  ( half3 ) ( -0.5743977 ,  1.36858544 ,  -0.53183455 ) ;  
			
		
	
	
		
			
				
					
						
							
								 
						
						
							
								 
						
						
					 
				
				@ -75,115 +87,122 @@ __kernel void debayer10(const __global uchar * in, 
			
		
	
		
			
				
					                        float  black_level   
			
		
	
		
			
				
					                       )   
			
		
	
		
			
				
					{  
			
		
	
		
			
				
					  const  int  x_global  =  get_global_id ( 0 ) ;   
			
		
	
		
			
				
					  const  int  y_global  =  get_global_id ( 1 ) ;   
			
		
	
		
			
				
					  const  int  gid_x  =  get_global_id ( 0 ) ;   
			
		
	
		
			
				
					  const  int  gid_y  =  get_global_id ( 1 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  int  lid_x  =  get_local_id ( 0 ) ;   
			
		
	
		
			
				
					  const  int  lid_y  =  get_local_id ( 1 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  int  x_local  =  get_local_id ( 0 ) ;   
			
		
	
		
			
				
					  const  int  y_local  =  get_local_id ( 1 ) ;   
			
		
	
		
			
				
					  const  int  localRowLen   =  mad24 ( get_local_size ( 0 ) ,  2 ,  2 )  // 2 padding   
			
		
	
		
			
				
					  const  int  localColLen   =  mad24 ( get_local_size ( 1 ) ,  2 ,  2 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  int  localRowLen  =  2  +  get_local_size ( 0 ) ; // 2 padding   
			
		
	
		
			
				
					  const  int  localColLen  =  2  +  get_local_size ( 1 ) ;   
			
		
	
		
			
				
					  const  int  x_global  =  mul24 ( gid_x,  2 ) ;   
			
		
	
		
			
				
					  const  int  y_global  =  mul24 ( gid_y,  2 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  int  localOffset  =  ( y_local  +  1 )  *  localRowLen  +  x_local  +  1 ;   
			
		
	
		
			
				
					  const  int  x_local  =  mad24 ( lid_x,  2 ,  1 ) ;   
			
		
	
		
			
				
					  const  int  y_local  =  mad24 ( lid_y,  2 ,  1 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  int  out_idx  =  3  *  x_global  +  3  *  y_global  *  RGB_WIDTH ;   
			
		
	
		
			
				
					  const  int  x_global_mod  =  ( gid_x  ==  0  | |  gid_x  ==  get_global_size ( 0 )  -  1 )  ?  -1:  1 ;   
			
		
	
		
			
				
					  const  int  y_global_mod  =  ( gid_y  ==  0  | |  gid_y  ==  get_global_size ( 1 )  -  1 )  ?  -1:  1 ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  //  cache  padding   
			
		
	
		
			
				
					  int  localColOffset  =  -1 ;   
			
		
	
		
			
				
					  int  localColOffset  =  0 ;   
			
		
	
		
			
				
					  int  globalColOffset ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  int  x_global_mod  =  ( x_global  ==  0  | |  x_global  ==  RGB_WIDTH  -  1 )  ?  -1:  1  ;   
			
		
	
		
			
				
					  const  int  y_global_mod  =  ( y_global  ==  0  | |  y_global  ==  RGB_HEIGHT  -  1 )  ?  -1:  1  ;   
			
		
	
		
			
				
					
  
			
		
	
		
			
				
					  half   pv =  val_from_10 ( in,  x_global,  y_global,  black_level ) ;   
			
		
	
		
			
				
					  cached[localOffset]  =  pv ;   
			
		
	
		
			
				
					
  
			
		
	
		
			
				
					  //  cache  padding   
			
		
	
		
			
				
					  if  ( x_local  <  1 )  {   
			
		
	
		
			
				
					    localColOffset =  x_local  ;   
			
		
	
		
			
				
					    globalColOffset  =  -1 ;   
			
		
	
		
			
				
					    cached[ ( y_local  +  1 )  *  localRowLen  +  x_local]  =  val_from_10 ( in,  x_global-x_global_mod,  y_global,  black_level ) ;    
			
		
	
		
			
				
					  }  else  if  ( x_local  >=  get_local_size ( 0 )  -  1 )  {   
			
		
	
		
			
				
					    localColOffset  =  x_local  +  2 ;   
			
		
	
		
			
				
					    globalColOffset  =  1 ;   
			
		
	
		
			
				
					    cached[localOffset  +  1 ]  =  val_from_10 ( in,  x_global+x_global_mod,  y_global,  black_level ) ;   
			
		
	
		
			
				
					  cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  +  0 ,  black_level )  ;   
			
		
	
		
			
				
					  cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  +  1 ) ]  =  val_from_10 ( in,  x_global  +  1 ,  y_global  +  0 ,  black_level )  ;   
			
		
	
		
			
				
					  cached[mad24 ( y_local  +  1 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  +  1 ,  black_level ) ;   
			
		
	
		
			
				
					  cac hed[m ad24 ( y_local  +  1 , localRowLen,  x_local  +  1 ) ] =  val_from_10 ( in,  x_global  +  1  ,  y_global  +  1  ,  black_level ) ;   
			
		
	
		
			
				
					
  
			
		
	
		
			
				
					  if  ( lid_x  ==  0 )  {   //  left  edge   
			
		
	
		
			
				
					    localColOffset  =  -1 ;    
			
		
	
		
			
				
					    globalColOffset  =  -x_global_mod ;    
			
		
	
		
			
				
					    cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  -  1 ) ] =  val_from_10 ( in,  x_g lobal  -  x_global_mod,  y_global  +  0 ,  black_level )  ;   
			
		
	
		
			
				
					    cached[mad24 ( y_local  +  1 ,  localRowLen,  x_local  -  1 ) ]  =  val_from_10 ( in,  x_global  -  x_global_mod,  y_global  +  1 ,  black_level ) ;   
			
		
	
		
			
				
					  }  else  if  ( lid_x  ==  get_local_size ( 0 )  -  1 )  {   //  right  edge   
			
		
	
		
			
				
					    localColOffset  =  2 ;    
			
		
	
		
			
				
					    globalColOffset  =  x_global_mod  +  1 ;   
			
		
	
		
			
				
					    cached[mad24 ( y_local  +  0 ,  localRowLen,  x_local  +  2 ) ]  =  val_from_10 ( in,  x_global  +  x_global_mod  +  1 ,  y_global  +  0 ,  black_level ) ;   
			
		
	
		
			
				
					    cached[mad24 ( y_local  +  1 ,  localRowLen,  x_local  +  2 )  ]  =  val_from_10 ( in,  x_global   +   x_global_mod  +  1  ,  y_global  +  1  ,  black_level ) ;   
			
		
	
		
			
				
					  }   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  if  ( y_local  <  1 )  {   
			
		
	
		
			
				
					    cached[y_local  *  localRowLen  +  x_local  +  1]  =  val_from_10 ( in,  x_global,  y_global-y_global_mod,  black_level ) ;   
			
		
	
		
			
				
					    if  ( localColOffset  !=  -1 )  {   
			
		
	
		
			
				
					      cached[y_local  *  localRowLen  +  localColOffset]  =  val_from_10 ( in,  x_global+ ( x_global_mod*globalColOffset ) ,  y_global-y_global_mod,  black_level ) ;   
			
		
	
		
			
				
					  if  ( lid_y  ==  0 )  {   //  top  row   
			
		
	
		
			
				
					    cached[mad24 ( y_local  -  1 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  -  y_global_mod,  black_level ) ;   
			
		
	
		
			
				
					    cached[mad24 ( y_local  -  1 ,  localRowLen,  x_local  +  1 ) ]  =  val_from_10 ( in,  x_global  +  1 ,  y_global  -  y_global_mod,  black_level ) ;   
			
		
	
		
			
				
					    if  ( localColOffset  !=  0 )  {   //  cache  corners   
			
		
	
		
			
				
					      cached[mad24 ( y_local  -  1 ,  localRowLen,  x_local  +  localColOffset ) ]  =  val_from_10 ( in,  x_global  +  globalColOffset,  y_global  -  y_global_mod,  black_level ) ;   
			
		
	
		
			
				
					    }   
			
		
	
		
			
				
					  }  else  if  ( y_local  >=  get_local_size ( 1 )  -  1 )  {   
			
		
	
		
			
				
					    cached[ ( y_local  +  2 )  *  localRowLen  +  x_local  +  1]  =  val_from_10 ( in,  x_global,  y_global+y_global_mod,  black_level ) ;   
			
		
	
		
			
				
					    if  ( localColOffset  !=  -1 )  {   
			
		
	
		
			
				
					      cached[ ( y_local  +  2 )  *  localRowLen  +  localColOffset]  =  val_from_10 ( in,  x_global+ ( x_global_mod*globalColOffset ) ,  y_global+y_global_mod,  black_level ) ;   
			
		
	
		
			
				
					  }  else  if  ( lid_y  ==  get_local_size ( 1 )  -  1 )  {   //  bottom  row   
			
		
	
		
			
				
					    cached[mad24 ( y_local  +  2 ,  localRowLen,  x_local  +  0 ) ]  =  val_from_10 ( in,  x_global  +  0 ,  y_global  +  y_global_mod  +  1 ,  black_level ) ;   
			
		
	
		
			
				
					    cached[mad24 ( y_local  +  2 ,  localRowLen,  x_local  +  1 ) ]  =  val_from_10 ( in,  x_global  +  1 ,  y_global  +  y_global_mod  +  1 ,  black_level ) ;   
			
		
	
		
			
				
					    if  ( localColOffset  !=  0 )  {   //  cache  corners   
			
		
	
		
			
				
					      cached[mad24 ( y_local  +  2 ,  localRowLen,  x_local  +  localColOffset ) ]  =  val_from_10 ( in,  x_global  +  globalColOffset,  y_global  +  y_global_mod  +  1 ,  black_level ) ;   
			
		
	
		
			
				
					    }   
			
		
	
		
			
				
					  }   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  //  sync   
			
		
	
		
			
				
					  barrier ( CLK_LOCAL_MEM_FENCE ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  half  d1  =  cached[localOffset  -  localRowLen  -  1] ;   
			
		
	
		
			
				
					  half  d2  =  cached[localOffset  -  localRowLen  +  1] ;   
			
		
	
		
			
				
					  half  d3  =  cached[localOffset  +  localRowLen  -  1] ;   
			
		
	
		
			
				
					  half  d4  =  cached[localOffset  +  localRowLen  +  1] ;   
			
		
	
		
			
				
					  half  n1  =  cached[localOffset  -  localRowLen] ;   
			
		
	
		
			
				
					  half  n2  =  cached[localOffset  +  1] ;   
			
		
	
		
			
				
					  half  n3  =  cached[localOffset  +  localRowLen] ;   
			
		
	
		
			
				
					  half  n4  =  cached[localOffset  -  1] ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  half3  rgb ;   
			
		
	
		
			
				
					  uchar3  rgb_out[4] ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  //  a  simplified  version  of  https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf   
			
		
	
		
			
				
					  if  ( x_global  %  2  ==  0 )  {   
			
		
	
		
			
				
					    if  ( y_global  %  2  ==  0 )  {   
			
		
	
		
			
				
					      rgb.y  =  pv ; // G1(R)   
			
		
	
		
			
				
					      half  k1  =  get_k ( d1,  pv,  d2,  pv ) ;   
			
		
	
		
			
				
					      half  k2  =  get_k ( d2,  pv,  d4,  pv ) ;   
			
		
	
		
			
				
					      half  k3  =  get_k ( d3,  pv,  d4,  pv ) ;   
			
		
	
		
			
				
					      half  k4  =  get_k ( d1,  pv,  d3,  pv ) ;   
			
		
	
		
			
				
					      //  R_G1   
			
		
	
		
			
				
					      rgb.x  =  ( k2*n2+k4*n4 ) / ( k2+k4 ) ;   
			
		
	
		
			
				
					      //  B_G1   
			
		
	
		
			
				
					      rgb.z  =  ( k1*n1+k3*n3 ) / ( k1+k3 ) ;   
			
		
	
		
			
				
					    }  else  {   
			
		
	
		
			
				
					      rgb.z  =  pv ; // B   
			
		
	
		
			
				
					      half  k1  =  get_k ( d1,  d3,  d2,  d4 ) ;   
			
		
	
		
			
				
					      half  k2  =  get_k ( n1,  n4,  n2,  n3 ) ;   
			
		
	
		
			
				
					      half  k3  =  get_k ( d1,  d2,  d3,  d4 ) ;   
			
		
	
		
			
				
					      half  k4  =  get_k ( n1,  n2,  n3,  n4 ) ;   
			
		
	
		
			
				
					      //  G_B   
			
		
	
		
			
				
					      rgb.y  =  ( k1* ( n1+n3 ) *0.5+k3* ( n2+n4 ) *0.5 ) / ( k1+k3 ) ;   
			
		
	
		
			
				
					      //  R_B   
			
		
	
		
			
				
					      rgb.x  =  ( k2* ( d2+d3 ) *0.5+k4* ( d1+d4 ) *0.5 ) / ( k2+k4 ) ;   
			
		
	
		
			
				
					    }   
			
		
	
		
			
				
					  }  else  {   
			
		
	
		
			
				
					    if  ( y_global  %  2  ==  0 )  {   
			
		
	
		
			
				
					      rgb.x  =  pv ; // R   
			
		
	
		
			
				
					      half  k1  =  get_k ( d1,  d3,  d2,  d4 ) ;   
			
		
	
		
			
				
					      half  k2  =  get_k ( n1,  n4,  n2,  n3 ) ;   
			
		
	
		
			
				
					      half  k3  =  get_k ( d1,  d2,  d3,  d4 ) ;   
			
		
	
		
			
				
					      half  k4  =  get_k ( n1,  n2,  n3,  n4 ) ;   
			
		
	
		
			
				
					      //  G_R   
			
		
	
		
			
				
					      rgb.y  =  ( k1* ( n1+n3 ) *0.5+k3* ( n2+n4 ) *0.5 ) / ( k1+k3 ) ;   
			
		
	
		
			
				
					      //  B_R   
			
		
	
		
			
				
					      rgb.z  =  ( k2* ( d2+d3 ) *0.5+k4* ( d1+d4 ) *0.5 ) / ( k2+k4 ) ;   
			
		
	
		
			
				
					    }  else  {   
			
		
	
		
			
				
					      rgb.y  =  pv ; // G2(B)   
			
		
	
		
			
				
					      half  k1  =  get_k ( d1,  pv,  d2,  pv ) ;   
			
		
	
		
			
				
					      half  k2  =  get_k ( d2,  pv,  d4,  pv ) ;   
			
		
	
		
			
				
					      half  k3  =  get_k ( d3,  pv,  d4,  pv ) ;   
			
		
	
		
			
				
					      half  k4  =  get_k ( d1,  pv,  d3,  pv ) ;   
			
		
	
		
			
				
					      //  R_G2   
			
		
	
		
			
				
					      rgb.x  =  ( k1*n1+k3*n3 ) / ( k1+k3 ) ;   
			
		
	
		
			
				
					      //  B_G2   
			
		
	
		
			
				
					      rgb.z  =  ( k2*n2+k4*n4 ) / ( k2+k4 ) ;   
			
		
	
		
			
				
					    }   
			
		
	
		
			
				
					  }   
			
		
	
		
			
				
					  const  half4  va  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  0 ,  localRowLen,  lid_x  *  2 ) ) ;   
			
		
	
		
			
				
					  const  half4  vb  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  1 ,  localRowLen,  lid_x  *  2 ) ) ;   
			
		
	
		
			
				
					  const  half4  vc  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  2 ,  localRowLen,  lid_x  *  2 ) ) ;   
			
		
	
		
			
				
					  const  half4  vd  =  vload4 ( 0 ,  cached  +  mad24 ( lid_y  *  2  +  3 ,  localRowLen,  lid_x  *  2 ) ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  uchar3  rgbc  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  ( half ) 0.0 ,  ( half ) 1.0 ) )  *  255.0 ) ;   
			
		
	
		
			
				
					  out[out_idx  +  0]  =  rgbc.z ;   
			
		
	
		
			
				
					  out[out_idx  +  1]  =  rgbc.y ;   
			
		
	
		
			
				
					  out[out_idx  +  2]  =  rgbc.x ;   
			
		
	
		
			
				
					  //  a  simplified  version  of  https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf   
			
		
	
		
			
				
					  const  half  k01  =  get_k ( va.s0,  vb.s1,  va.s2,  vb.s1 ) ;   
			
		
	
		
			
				
					  const  half  k02  =  get_k ( va.s2,  vb.s1,  vc.s2,  vb.s1 ) ;   
			
		
	
		
			
				
					  const  half  k03  =  get_k ( vc.s0,  vb.s1,  vc.s2,  vb.s1 ) ;   
			
		
	
		
			
				
					  const  half  k04  =  get_k ( va.s0,  vb.s1,  vc.s0,  vb.s1 ) ;   
			
		
	
		
			
				
					  rgb.x  =  ( k02*vb.s2+k04*vb.s0 ) / ( k02+k04 ) ; // R_G1   
			
		
	
		
			
				
					  rgb.y  =  vb.s1 ; // G1(R)   
			
		
	
		
			
				
					  rgb.z  =  ( k01*va.s1+k03*vc.s1 ) / ( k01+k03 ) ; // B_G1   
			
		
	
		
			
				
					  rgb_out[0]  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  0.0 ,  1.0 ) )  *  255.0 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  half  k11  =  get_k ( va.s1,  vc.s1,  va.s3,  vc.s3 ) ;   
			
		
	
		
			
				
					  const  half  k12  =  get_k ( va.s2,  vb.s1,  vb.s3,  vc.s2 ) ;   
			
		
	
		
			
				
					  const  half  k13  =  get_k ( va.s1,  va.s3,  vc.s1,  vc.s3 ) ;   
			
		
	
		
			
				
					  const  half  k14  =  get_k ( va.s2,  vb.s3,  vc.s2,  vb.s1 ) ;   
			
		
	
		
			
				
					  rgb.x  =  vb.s2 ; // R   
			
		
	
		
			
				
					  rgb.y  =  ( k11* ( va.s2+vc.s2 ) *0.5+k13* ( vb.s3+vb.s1 ) *0.5 ) / ( k11+k13 ) ; // G_R   
			
		
	
		
			
				
					  rgb.z  =  ( k12* ( va.s3+vc.s1 ) *0.5+k14* ( va.s1+vc.s3 ) *0.5 ) / ( k12+k14 ) ; // B_R   
			
		
	
		
			
				
					  rgb_out[1]  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  0.0 ,  1.0 ) )  *  255.0 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  half  k21  =  get_k ( vb.s0,  vd.s0,  vb.s2,  vd.s2 ) ;   
			
		
	
		
			
				
					  const  half  k22  =  get_k ( vb.s1,  vc.s0,  vc.s2,  vd.s1 ) ;   
			
		
	
		
			
				
					  const  half  k23  =  get_k ( vb.s0,  vb.s2,  vd.s0,  vd.s2 ) ;   
			
		
	
		
			
				
					  const  half  k24  =  get_k ( vb.s1,  vc.s2,  vd.s1,  vc.s0 ) ;   
			
		
	
		
			
				
					  rgb.x  =  ( k22* ( vb.s2+vd.s0 ) *0.5+k24* ( vb.s0+vd.s2 ) *0.5 ) / ( k22+k24 ) ; // R_B   
			
		
	
		
			
				
					  rgb.y  =  ( k21* ( vb.s1+vd.s1 ) *0.5+k23* ( vc.s2+vc.s0 ) *0.5 ) / ( k21+k23 ) ; // G_B   
			
		
	
		
			
				
					  rgb.z  =  vc.s1 ; // B   
			
		
	
		
			
				
					  rgb_out[2]  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  0.0 ,  1.0 ) )  *  255.0 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  const  half  k31  =  get_k ( vb.s1,  vc.s2,  vb.s3,  vc.s2 ) ;   
			
		
	
		
			
				
					  const  half  k32  =  get_k ( vb.s3,  vc.s2,  vd.s3,  vc.s2 ) ;   
			
		
	
		
			
				
					  const  half  k33  =  get_k ( vd.s1,  vc.s2,  vd.s3,  vc.s2 ) ;   
			
		
	
		
			
				
					  const  half  k34  =  get_k ( vb.s1,  vc.s2,  vd.s1,  vc.s2 ) ;   
			
		
	
		
			
				
					  rgb.x  =  ( k31*vb.s2+k33*vd.s2 ) / ( k31+k33 ) ; // R_G2   
			
		
	
		
			
				
					  rgb.y  =  vc.s2 ; // G2(B)   
			
		
	
		
			
				
					  rgb.z  =  ( k32*vc.s3+k34*vc.s1 ) / ( k32+k34 ) ; // B_G2   
			
		
	
		
			
				
					  rgb_out[3]  =  convert_uchar3_sat ( color_correct ( clamp ( rgb,  0.0 ,  1.0 ) )  *  255.0 ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  //  write  ys   
			
		
	
		
			
				
					  uchar2  yy  =  ( uchar2 ) (   
			
		
	
		
			
				
					    RGB_TO_Y ( rgb_out[0].s0,  rgb_out[0].s1,  rgb_out[0].s2 ) ,   
			
		
	
		
			
				
					    RGB_TO_Y ( rgb_out[1].s0,  rgb_out[1].s1,  rgb_out[1].s2 )   
			
		
	
		
			
				
					  ) ;   
			
		
	
		
			
				
					  vstore2 ( yy,  0 ,  out  +  mad24 ( gid_y  *  2 ,  RGB_WIDTH,  gid_x  *  2 ) ) ;   
			
		
	
		
			
				
					  yy  =  ( uchar2 ) (   
			
		
	
		
			
				
					    RGB_TO_Y ( rgb_out[2].s0,  rgb_out[2].s1,  rgb_out[2].s2 ) ,   
			
		
	
		
			
				
					    RGB_TO_Y ( rgb_out[3].s0,  rgb_out[3].s1,  rgb_out[3].s2 )   
			
		
	
		
			
				
					  ) ;   
			
		
	
		
			
				
					  vstore2 ( yy,  0 ,  out  +  mad24 ( gid_y  *  2  +  1 ,  RGB_WIDTH,  gid_x  *  2 ) ) ;   
			
		
	
		
			
				
					
 
			
		
	
		
			
				
					  //  write  uvs   
			
		
	
		
			
				
					  const  short  ar  =  AVERAGE ( rgb_out[0].s0,  rgb_out[1].s0,  rgb_out[2].s0,  rgb_out[3].s0 ) ;   
			
		
	
		
			
				
					  const  short  ag  =  AVERAGE ( rgb_out[0].s1,  rgb_out[1].s1,  rgb_out[2].s1,  rgb_out[3].s1 ) ;   
			
		
	
		
			
				
					  const  short  ab  =  AVERAGE ( rgb_out[0].s2,  rgb_out[1].s2,  rgb_out[2].s2,  rgb_out[3].s2 ) ;   
			
		
	
		
			
				
					  out[U_OFFSET  +  mad24 ( gid_y,  UV_WIDTH,  gid_x ) ]  =  RGB_TO_U ( ar,  ag,  ab ) ;   
			
		
	
		
			
				
					  out[V_OFFSET  +  mad24 ( gid_y,  UV_WIDTH,  gid_x ) ]  =  RGB_TO_V ( ar,  ag,  ab ) ;   
			
		
	
		
			
				
					}