//  https://github.com/moskewcz/boda/issues/13 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								//#define  USE_FP16 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# ifdef  USE_FP16 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  # define  xtype  half4 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  # define  read_imagep  read_imageh 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  # define  write_imagep  write_imageh 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# else 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  # define  xtype  float4 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  # define  read_imagep  read_imagef 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  # define  write_imagep  write_imagef 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# endif 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								# pragma  OPENCL  EXTENSION  cl_khr_fp16  :  enable 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								__kernel  void  gemm ( const  int  M,  const  int  N,  const  int  K, 
							 
						 
					
						
							
								
							 
							
								
									
										 
									 
								
							 
							
								 
							 
							
							
								  read_only  image2d_t  A, 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  read_only  image2d_t  B, 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  write_only  image2d_t  C ) 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								{ 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  const  sampler_t  smp  =  CLK_NORMALIZED_COORDS_FALSE  | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								                        CLK_ADDRESS_CLAMP            | 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								                        CLK_FILTER_NEAREST ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  xtype  c_r[4]  =  {0,0,0,0} ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  xtype  a_r[4],  b_r[4] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  int  const  a_off_thr  =  get_global_id ( 0 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  int  const  b_off_thr  =  get_global_id ( 1 ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  int2  a_samp  =  {0,  a_off_thr} ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  int2  b_samp  =  {0,  b_off_thr} ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  for  ( short  k  =  0 ; k < K/4; k++) { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    for  ( short  i  =  0 ; i < 4; ++i) { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      a_r[i]  =  read_imagep ( A,  smp,  a_samp ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      b_r[i]  =  read_imagep ( B,  smp,  b_samp ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ++a_samp.x ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ++b_samp.x ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    for  ( short  i  =  0 ; i < 4; ++i) { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      float4  ov  =  c_r[i] ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.x  +=  a_r[i].x  *  b_r[0].x ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.x  +=  a_r[i].y  *  b_r[0].y ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.x  +=  a_r[i].z  *  b_r[0].z ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.x  +=  a_r[i].w  *  b_r[0].w ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.y  +=  a_r[i].x  *  b_r[1].x ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.y  +=  a_r[i].y  *  b_r[1].y ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.y  +=  a_r[i].z  *  b_r[1].z ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.y  +=  a_r[i].w  *  b_r[1].w ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.z  +=  a_r[i].x  *  b_r[2].x ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.z  +=  a_r[i].y  *  b_r[2].y ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.z  +=  a_r[i].z  *  b_r[2].z ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.z  +=  a_r[i].w  *  b_r[2].w ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.w  +=  a_r[i].x  *  b_r[3].x ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.w  +=  a_r[i].y  *  b_r[3].y ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.w  +=  a_r[i].z  *  b_r[3].z ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      ov.w  +=  a_r[i].w  *  b_r[3].w ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								      c_r[i]  =  ov ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  int2  c_samp  =  {a_off_thr,  b_off_thr*4} ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  for  ( short  i  =  0 ; i < 4; i++) { 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    write_imagep ( C,  c_samp,  c_r[i] ) ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								    ++c_samp.y ; 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								  } 
 
							 
						 
					
						
							
								
							 
							
								
							 
							
								 
							 
							
							
								}