// const __constant float3 rgb_weights = ( 0.299 , 0.587 , 0.114 ) ; // opencv rgb2gray weights
// const __constant float3 bgr_weights = ( 0.114 , 0.587 , 0.299 ) ; // bgr2gray weights
// convert input rgb image to single channel then conv
__kernel void rgb2gray_conv2d (
const __global uchar * input,
__global short * output,
__constant short * filter,
__local uchar3 * cached
)
{
const int rowOffset = get_global_id ( 1 ) * IMAGE_W ;
const int my = get_global_id ( 0 ) + rowOffset ;
const int localRowLen = TWICE_HALF_FILTER_SIZE + get_local_size ( 0 ) ;
const int localRowOffset = ( get_local_id ( 1 ) + HALF_FILTER_SIZE ) * localRowLen ;
const int myLocal = localRowOffset + get_local_id ( 0 ) + HALF_FILTER_SIZE ;
// cache local pixels
cached[ myLocal ].x = input[ my * 3 ] ; // r
cached[ myLocal ].y = input[ my * 3 + 1] ; // g
cached[ myLocal ].z = input[ my * 3 + 2] ; // b
// pad
if (
get_global_id ( 0 ) < HALF_FILTER_SIZE | |
get_global_id ( 0 ) > IMAGE_W - HALF_FILTER_SIZE - 1 | |
get_global_id ( 1 ) < HALF_FILTER_SIZE | |
get_global_id ( 1 ) > IMAGE_H - HALF_FILTER_SIZE - 1
)
{
barrier ( CLK_LOCAL_MEM_FENCE ) ;
return ;
}
else
{
int localColOffset = -1 ;
int globalColOffset = -1 ;
// cache extra
if ( get_local_id ( 0 ) < HALF_FILTER_SIZE )
{
localColOffset = get_local_id ( 0 ) ;
globalColOffset = -HALF_FILTER_SIZE ;
cached[ localRowOffset + get_local_id ( 0 ) ].x = input[ my * 3 - HALF_FILTER_SIZE * 3 ] ;
cached[ localRowOffset + get_local_id ( 0 ) ].y = input[ my * 3 - HALF_FILTER_SIZE * 3 + 1] ;
cached[ localRowOffset + get_local_id ( 0 ) ].z = input[ my * 3 - HALF_FILTER_SIZE * 3 + 2] ;
}
else if ( get_local_id ( 0 ) >= get_local_size ( 0 ) - HALF_FILTER_SIZE )
{
localColOffset = get_local_id ( 0 ) + TWICE_HALF_FILTER_SIZE ;
globalColOffset = HALF_FILTER_SIZE ;
cached[ myLocal + HALF_FILTER_SIZE ].x = input[ my * 3 + HALF_FILTER_SIZE * 3 ] ;
cached[ myLocal + HALF_FILTER_SIZE ].y = input[ my * 3 + HALF_FILTER_SIZE * 3 + 1] ;
cached[ myLocal + HALF_FILTER_SIZE ].z = input[ my * 3 + HALF_FILTER_SIZE * 3 + 2] ;
}
if ( get_local_id ( 1 ) < HALF_FILTER_SIZE )
{
cached[ get_local_id ( 1 ) * localRowLen + get_local_id ( 0 ) + HALF_FILTER_SIZE ].x = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 ] ;
cached[ get_local_id ( 1 ) * localRowLen + get_local_id ( 0 ) + HALF_FILTER_SIZE ].y = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + 1] ;
cached[ get_local_id ( 1 ) * localRowLen + get_local_id ( 0 ) + HALF_FILTER_SIZE ].z = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + 2] ;
if ( localColOffset > 0 )
{
cached[ get_local_id ( 1 ) * localRowLen + localColOffset ].x = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3] ;
cached[ get_local_id ( 1 ) * localRowLen + localColOffset ].y = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 1] ;
cached[ get_local_id ( 1 ) * localRowLen + localColOffset ].z = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 2] ;
}
}
else if ( get_local_id ( 1 ) >= get_local_size ( 1 ) -HALF_FILTER_SIZE )
{
int offset = ( get_local_id ( 1 ) + TWICE_HALF_FILTER_SIZE ) * localRowLen ;
cached[ offset + get_local_id ( 0 ) + HALF_FILTER_SIZE ].x = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 ] ;
cached[ offset + get_local_id ( 0 ) + HALF_FILTER_SIZE ].y = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + 1] ;
cached[ offset + get_local_id ( 0 ) + HALF_FILTER_SIZE ].z = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + 2] ;
if ( localColOffset > 0 )
{
cached[ offset + localColOffset ].x = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3] ;
cached[ offset + localColOffset ].y = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 1] ;
cached[ offset + localColOffset ].z = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 2] ;
}
}
// sync
barrier ( CLK_LOCAL_MEM_FENCE ) ;
// perform convolution
int fIndex = 0 ;
short sum = 0 ;
for ( int r = -HALF_FILTER_SIZE ; r <= HALF_FILTER_SIZE; r++)
{
int curRow = r * localRowLen ;
for ( int c = -HALF_FILTER_SIZE ; c <= HALF_FILTER_SIZE; c++, fIndex++)
{
if ( !FLIP_RB ) {
// sum += dot ( rgb_weights, cached[ myLocal + curRow + c ] ) * filter[ fIndex ] ;
sum += ( cached[ myLocal + curRow + c ].x / 3 + cached[ myLocal + curRow + c ].y / 2 + cached[ myLocal + curRow + c ].z / 9 ) * filter[ fIndex ] ;
} else {
// sum += dot ( bgr_weights, cached[ myLocal + curRow + c ] ) * filter[ fIndex ] ;
sum += ( cached[ myLocal + curRow + c ].x / 9 + cached[ myLocal + curRow + c ].y / 2 + cached[ myLocal + curRow + c ].z / 3 ) * filter[ fIndex ] ;
}
}
}
output[my] = sum ;
}
}