// Reference luma weights that the integer divisions in the kernel approximate:
//   RGB order: (0.299, 0.587, 0.114)   -- opencv rgb2gray weights
//   BGR order: (0.114, 0.587, 0.299)   -- bgr2gray weights

// Convert an interleaved 3-channel 8-bit image to a single gray channel and
// convolve it with a square filter, staging pixels in a work-group-local tile
// that carries a HALF_FILTER_SIZE-wide halo on every side.
//
// Macros expected from the build options (none are defined in this file):
//   IMAGE_W, IMAGE_H          image size in pixels
//   HALF_FILTER_SIZE          filter radius; the loops visit (2 * radius + 1)^2 taps
//   TWICE_HALF_FILTER_SIZE    presumably 2 * HALF_FILTER_SIZE -- TODO confirm at the build site
//   HALF_FILTER_SIZE_IMAGE_W  presumably HALF_FILTER_SIZE * IMAGE_W -- TODO confirm
//   FLIP_RB                   non-zero selects the BGR weighting, zero the RGB weighting
//
// Parameters:
//   input   interleaved image, 3 bytes per pixel, rows of IMAGE_W pixels
//   output  one short per pixel, indexed like the input pixels; pixels closer
//           than HALF_FILTER_SIZE to an image edge are never written
//   filter  filter taps, consumed row by row
//   cached  local scratch tile; must hold at least
//           (get_local_size(0) + TWICE_HALF_FILTER_SIZE) *
//           (get_local_size(1) + TWICE_HALF_FILTER_SIZE) uchar3 elements
//
// NOTE(review): the halo scheme assumes each work-group dimension is at least
// 2 * HALF_FILTER_SIZE, so that the "first" and "last" HALF_FILTER_SIZE items
// are distinct -- confirm against the host-side launch parameters.
__kernel void rgb2gray_conv2d(
  const __global uchar * input,
  __global short * output,
  __constant short * filter,
  __local uchar3 * cached
)
{
  const int gx = (int)get_global_id(0);
  const int gy = (int)get_global_id(1);
  const int lx = (int)get_local_id(0);
  const int ly = (int)get_local_id(1);
  const int localW = (int)get_local_size(0);
  const int localH = (int)get_local_size(1);

  // Linear pixel index of this work-item in the image.
  const int my = gy * IMAGE_W + gx;

  // Position of this work-item's own pixel inside the padded local tile.
  const int localRowLen = TWICE_HALF_FILTER_SIZE + localW;
  const int localRowOffset = (ly + HALF_FILTER_SIZE) * localRowLen;
  const int myLocal = localRowOffset + lx + HALF_FILTER_SIZE;

  // Which pixels at distance HALF_FILTER_SIZE from this one lie inside the
  // image. Halo loads are guarded by these flags rather than by "is this an
  // output pixel", because work-items that produce no output still have to
  // stage halo texels that interior neighbours in the same group read.
  const bool inImage  = gx < IMAGE_W && gy < IMAGE_H;
  const bool hasLeft  = inImage && gx >= HALF_FILTER_SIZE;
  const bool hasRight = inImage && gx + HALF_FILTER_SIZE < IMAGE_W;
  const bool hasAbove = inImage && gy >= HALF_FILTER_SIZE;
  const bool hasBelow = inImage && gy + HALF_FILTER_SIZE < IMAGE_H;

  // Cache this work-item's own pixel.
  if (inImage)
  {
    cached[myLocal] = vload3(my, input);
  }

  // Side halo: the first / last HALF_FILTER_SIZE columns of the group each
  // fetch one extra pixel to the left / right of the tile.
  int localColOffset = 0;   // tile column of the side-halo texel
  int globalColOffset = 0;  // pixel offset from `my` to its source
  bool loadsSide = false;   // true when this item owns an in-image side texel

  if (lx < HALF_FILTER_SIZE)
  {
    localColOffset = lx;
    globalColOffset = -HALF_FILTER_SIZE;
    loadsSide = hasLeft;
    if (loadsSide)
    {
      cached[localRowOffset + lx] = vload3(my - HALF_FILTER_SIZE, input);
    }
  }
  else if (lx >= localW - HALF_FILTER_SIZE)
  {
    localColOffset = lx + TWICE_HALF_FILTER_SIZE;
    globalColOffset = HALF_FILTER_SIZE;
    loadsSide = hasRight;
    if (loadsSide)
    {
      cached[myLocal + HALF_FILTER_SIZE] = vload3(my + HALF_FILTER_SIZE, input);
    }
  }

  // Top / bottom halo: the first / last HALF_FILTER_SIZE rows of the group
  // each fetch one extra pixel above / below the tile, plus the diagonal
  // corner texel when the item also owns a side-halo column.
  int haloRowOffset = 0;    // start of the halo row inside the tile
  int globalRowOffset = 0;  // pixel offset from `my` to the halo source row
  bool loadsRow = false;

  if (ly < HALF_FILTER_SIZE)
  {
    haloRowOffset = ly * localRowLen;
    globalRowOffset = -HALF_FILTER_SIZE_IMAGE_W;
    loadsRow = hasAbove;
  }
  else if (ly >= localH - HALF_FILTER_SIZE)
  {
    haloRowOffset = (ly + TWICE_HALF_FILTER_SIZE) * localRowLen;
    globalRowOffset = HALF_FILTER_SIZE_IMAGE_W;
    loadsRow = hasBelow;
  }

  if (loadsRow)
  {
    cached[haloRowOffset + lx + HALF_FILTER_SIZE] = vload3(my + globalRowOffset, input);

    // An explicit flag is used instead of a sentinel comparison on
    // localColOffset because tile column 0 is a valid corner column.
    if (loadsSide)
    {
      cached[haloRowOffset + localColOffset] =
        vload3(my + globalRowOffset + globalColOffset, input);
    }
  }

  // Single synchronisation point reached by every work-item of the group;
  // OpenCL does not allow work-items of one group to wait on different
  // barrier calls.
  barrier(CLK_LOCAL_MEM_FENCE);

  // Pixels whose filter window would leave the image produce no output.
  if (!(hasLeft && hasRight && hasAbove && hasBelow))
  {
    return;
  }

  // Integer stand-ins for the luma weights: 1/3 for red, 1/2 for green,
  // 1/9 for blue. FLIP_RB swaps the divisors of the first and third channel.
  const int xDiv = (FLIP_RB) ? 9 : 3;
  const int zDiv = (FLIP_RB) ? 3 : 9;

  // Perform the convolution over the cached tile.
  int fIndex = 0;
  short sum = 0;

  for (int r = -HALF_FILTER_SIZE; r <= HALF_FILTER_SIZE; r++)
  {
    const int rowBase = myLocal + r * localRowLen;
    for (int c = -HALF_FILTER_SIZE; c <= HALF_FILTER_SIZE; c++, fIndex++)
    {
      const uchar3 px = cached[rowBase + c];
      sum += (px.x / xDiv + px.y / 2 + px.z / zDiv) * filter[fIndex];
    }
  }

  output[my] = sum;
}
 |