/* Number of fractional bits kept in the fixed-point source coordinates. */
#define INTER_BITS 5
/* Number of sub-pixel positions per pixel (2^INTER_BITS). */
#define INTER_TAB_SIZE (1 << INTER_BITS)
/* Size of one sub-pixel step as a fraction of a pixel. Parenthesized so the
 * macro behaves as a single value inside larger expressions
 * (e.g. `x / INTER_SCALE`). */
#define INTER_SCALE (1.f / INTER_TAB_SIZE)
/* Number of fractional bits in the fixed-point interpolation weights. */
#define INTER_REMAP_COEF_BITS 15
/* Fixed-point representation of a weight of 1.0 (2^INTER_REMAP_COEF_BITS). */
#define INTER_REMAP_COEF_SCALE (1 << INTER_REMAP_COEF_BITS)
/*
 * Perspective warp of an 8-bit image using bilinear interpolation.
 *
 * Each work-item produces one destination pixel (dx, dy): the destination
 * coordinate is projected through the 3x3 matrix M to a source position, the
 * four neighbouring source pixels are fetched (coordinates outside the source
 * are clamped to the nearest edge pixel), and they are blended with
 * fixed-point weights.
 *
 * src            - source pixel data, addressed in bytes
 * src_row_stride - byte distance between consecutive source rows
 * src_px_stride  - byte distance between horizontally adjacent source pixels
 * src_offset     - byte offset of source pixel (0, 0) from `src`
 * src_rows       - source height in pixels
 * src_cols       - source width in pixels
 * dst            - destination pixel data, one byte per pixel
 * dst_row_stride - byte distance between consecutive destination rows
 * dst_offset     - byte offset of destination pixel (0, 0) from `dst`
 * dst_rows       - destination height in pixels
 * dst_cols       - destination width in pixels
 * M              - 9 floats, row-major 3x3 matrix mapping destination
 *                  coordinates to source coordinates
 *
 * Note: the integer source coordinate is saturated to the `short` range before
 * clamping, so source positions beyond 32767 sample as if they were at 32767.
 */
__kernel void warpPerspective(__global const uchar *src,
                              int src_row_stride, int src_px_stride, int src_offset, int src_rows, int src_cols,
                              __global uchar *dst,
                              int dst_row_stride, int dst_offset, int dst_rows, int dst_cols,
                              __constant float *M)
{
    const int dx = get_global_id(0);
    const int dy = get_global_id(1);

    /* The global range may be padded beyond the image; ignore the excess. */
    if (dx >= dst_cols || dy >= dst_rows)
        return;

    /* Homogeneous source coordinates of this destination pixel. */
    const float X0 = M[0] * dx + M[1] * dy + M[2];
    const float Y0 = M[3] * dx + M[4] * dy + M[5];
    const float W0 = M[6] * dx + M[7] * dy + M[8];

    /* Perspective divide, pre-multiplied by INTER_TAB_SIZE so the rounded
     * result carries INTER_BITS fractional bits. A zero divisor maps to 0. */
    const float W = (W0 != 0.0f) ? INTER_TAB_SIZE / W0 : 0.0f;

    /* Round to nearest-even with saturation: projections that overflow int
     * (or are NaN) yield a defined value instead of relying on an
     * out-of-range implicit float->int conversion. */
    const int X = convert_int_sat_rte(X0 * W);
    const int Y = convert_int_sat_rte(Y0 * W);

    /* Integer part of the source position (top-left neighbour). */
    const int sx = convert_short_sat(X >> INTER_BITS);
    const int sy = convert_short_sat(Y >> INTER_BITS);

    /* Clamp the 2x2 neighbourhood to the image. The results are kept in
     * int: sx + 1 may reach 32768, which does not fit in a short and would
     * otherwise wrap to a negative index for very large sources. */
    const int x_lo = clamp(sx, 0, src_cols - 1);
    const int x_hi = clamp(sx + 1, 0, src_cols - 1);
    const int y_lo = clamp(sy, 0, src_rows - 1);
    const int y_hi = clamp(sy + 1, 0, src_rows - 1);

    /* Byte offsets of the two source rows and the two source columns. */
    const int row_lo = mad24(y_lo, src_row_stride, src_offset);
    const int row_hi = mad24(y_hi, src_row_stride, src_offset);
    const int col_lo = x_lo * src_px_stride;
    const int col_hi = x_hi * src_px_stride;

    const int v0 = src[row_lo + col_lo]; /* top-left     */
    const int v1 = src[row_lo + col_hi]; /* top-right    */
    const int v2 = src[row_hi + col_lo]; /* bottom-left  */
    const int v3 = src[row_hi + col_hi]; /* bottom-right */

    /* Fractional part of the source position, in [0, 1) with a step of
     * 1 / INTER_TAB_SIZE. */
    const int ax = X & (INTER_TAB_SIZE - 1);
    const int ay = Y & (INTER_TAB_SIZE - 1);
    const float tabx = (1.f / INTER_TAB_SIZE) * ax;
    const float taby = (1.f / INTER_TAB_SIZE) * ay;

    /* Bilinear weights in fixed point. Saturating to short caps the
     * weight 1.0 (32768) at 32767; after the rounding shift below an 8-bit
     * sample still comes out unchanged. */
    const int itab0 = convert_short_sat_rte((1.0f - taby) * (1.0f - tabx) * INTER_REMAP_COEF_SCALE);
    const int itab1 = convert_short_sat_rte((1.0f - taby) * tabx * INTER_REMAP_COEF_SCALE);
    const int itab2 = convert_short_sat_rte(taby * (1.0f - tabx) * INTER_REMAP_COEF_SCALE);
    const int itab3 = convert_short_sat_rte(taby * tabx * INTER_REMAP_COEF_SCALE);

    const int val = v0 * itab0 + v1 * itab1 + v2 * itab2 + v3 * itab3;

    /* Add half an LSB before shifting so the result is rounded, not
     * truncated, back to 8 bits. */
    const int rounding = 1 << (INTER_REMAP_COEF_BITS - 1);
    const int dst_index = mad24(dy, dst_row_stride, dst_offset + dx);
    dst[dst_index] = convert_uchar_sat((val + rounding) >> INTER_REMAP_COEF_BITS);
}