#include "ar0231_cl.h" #include "ox03c10_cl.h" #include "os04c10_cl.h" #define UV_WIDTH RGB_WIDTH / 2 #define UV_HEIGHT RGB_HEIGHT / 2 #define RGB_TO_Y(r, g, b) ((((mul24(b, 13) + mul24(g, 65) + mul24(r, 33)) + 64) >> 7) + 16) #define RGB_TO_U(r, g, b) ((mul24(b, 56) - mul24(g, 37) - mul24(r, 19) + 0x8080) >> 8) #define RGB_TO_V(r, g, b) ((mul24(r, 56) - mul24(g, 47) - mul24(b, 9) + 0x8080) >> 8) #define AVERAGE(x, y, z, w) ((convert_ushort(x) + convert_ushort(y) + convert_ushort(z) + convert_ushort(w) + 1) >> 1) float get_vignetting_s(float r) { #if IS_OS r = r / 2.2545f; #endif if (r < 62500) { return (1.0f + 0.0000008f*r); } else if (r < 490000) { return (0.9625f + 0.0000014f*r); } else if (r < 1102500) { return (1.26434f + 0.0000000000016f*r*r); } else { return (0.53503625f + 0.0000000000022f*r*r); } } float4 val4_from_12(uchar8 pvs, float gain) { uint4 parsed = (uint4)(((uint)pvs.s0<<4) + (pvs.s1>>4), // is from the previous 10 bit ((uint)pvs.s2<<4) + (pvs.s4&0xF), ((uint)pvs.s3<<4) + (pvs.s4>>4), ((uint)pvs.s5<<4) + (pvs.s7&0xF)); #if IS_OX // PWL //float4 pv = (convert_float4(parsed) - 64.0) / (4096.0 - 64.0); float4 pv = {ox03c10_lut[parsed.s0], ox03c10_lut[parsed.s1], ox03c10_lut[parsed.s2], ox03c10_lut[parsed.s3]}; // it's a 24 bit signal, center in the middle 8 bits return clamp(pv*gain*256.0, 0.0, 1.0); #else // AR // normalize and scale float4 pv = (convert_float4(parsed) - 168.0) / (4096.0 - 168.0); return clamp(pv*gain, 0.0, 1.0); #endif } float4 val4_from_10(uchar8 pvs, uchar ext, bool aligned, float gain) { uint4 parsed; if (aligned) { parsed = (uint4)(((uint)pvs.s0 << 2) + (pvs.s1 & 0b00000011), ((uint)pvs.s2 << 2) + ((pvs.s6 & 0b11000000) / 64), ((uint)pvs.s3 << 2) + ((pvs.s6 & 0b00110000) / 16), ((uint)pvs.s4 << 2) + ((pvs.s6 & 0b00001100) / 4)); } else { parsed = (uint4)(((uint)pvs.s0 << 2) + ((pvs.s3 & 0b00110000) / 16), ((uint)pvs.s1 << 2) + ((pvs.s3 & 0b00001100) / 4), ((uint)pvs.s2 << 2) + ((pvs.s3 & 0b00000011)), ((uint)pvs.s4 << 2) + ((ext & 0b11000000) / 64)); } float4 pv = (convert_float4(parsed) - 64.0) / (1024.0 - 64.0); return clamp(pv*gain, 0.0, 1.0); } float combine_pvs(float lv, float sv, int expo) { float svc = fmax(sv * expo, 61376.0); float svd = sv * fmin(expo, 8.0) / 8; if (expo > 64) { if (lv < 959) { return lv / (65536.0 - 64.0); } else { return (svc / 64) / (65536.0 - 64.0); } } else { if (lv > 32) { return (lv * 64 / fmax(expo, 8.0)) / (65536.0 - 64.0); } else { return svd / (65536.0 - 64.0); } } } float4 val4_from_10x2(uchar8 long_pvs, uchar long_ext, uchar8 short_pvs, uchar short_ext, bool aligned, float gain, int expo) { int8 parsed; if (aligned) { parsed = (int8)(((int)long_pvs.s0 << 2) + (long_pvs.s1 & 0b00000011), ((int)long_pvs.s2 << 2) + ((long_pvs.s6 & 0b11000000) / 64), ((int)long_pvs.s3 << 2) + ((long_pvs.s6 & 0b00110000) / 16), ((int)long_pvs.s4 << 2) + ((long_pvs.s6 & 0b00001100) / 4), ((int)short_pvs.s0 << 2) + (short_pvs.s1 & 0b00000011), ((int)short_pvs.s2 << 2) + ((short_pvs.s6 & 0b11000000) / 64), ((int)short_pvs.s3 << 2) + ((short_pvs.s6 & 0b00110000) / 16), ((int)short_pvs.s4 << 2) + ((short_pvs.s6 & 0b00001100) / 4)); } else { parsed = (int8)(((int)long_pvs.s0 << 2) + ((long_pvs.s3 & 0b00110000) / 16), ((int)long_pvs.s1 << 2) + ((long_pvs.s3 & 0b00001100) / 4), ((int)long_pvs.s2 << 2) + ((long_pvs.s3 & 0b00000011)), ((int)long_pvs.s4 << 2) + ((long_ext & 0b11000000) / 64), ((int)short_pvs.s0 << 2) + ((short_pvs.s3 & 0b00110000) / 16), ((int)short_pvs.s1 << 2) + ((short_pvs.s3 & 0b00001100) / 4), ((int)short_pvs.s2 << 2) + ((short_pvs.s3 & 0b00000011)), ((int)short_pvs.s4 << 2) + ((short_ext & 0b11000000) / 64)); } float8 pf = convert_float8(parsed - 64); float4 pv; pv.s0 = combine_pvs(pf.s0, pf.s4, expo); pv.s1 = combine_pvs(pf.s1, pf.s5, expo); pv.s2 = combine_pvs(pf.s2, pf.s6, expo); pv.s3 = combine_pvs(pf.s3, pf.s7, expo); return clamp(pv*gain, 0.0, 1.0); } float get_k(float a, float b, float c, float d) { return 2.0 - (fabs(a - b) + fabs(c - d)); } __kernel void process_raw(const __global uchar * in, __global uchar * out, int expo_time) { const int gid_x = get_global_id(0); const int gid_y = get_global_id(1); const int row_before_offset = (gid_y == 0) ? 2 : 0; const int row_after_offset = (gid_y == (RGB_HEIGHT/2 - 1)) ? 1 : 3; float3 rgb; uchar3 rgb_out[4]; #if IS_BGGR constant int row_read_order[] = {3, 2, 1, 0}; constant int rgb_write_order[] = {2, 3, 0, 1}; #else constant int row_read_order[] = {0, 1, 2, 3}; constant int rgb_write_order[] = {0, 1, 2, 3}; #endif int start_idx; #if IS_10BIT bool aligned10; if (gid_x % 2 == 0) { aligned10 = true; start_idx = (2 * gid_y - 1) * FRAME_STRIDE + (5 * gid_x / 2 - 2) + (FRAME_STRIDE * FRAME_OFFSET); } else { aligned10 = false; start_idx = (2 * gid_y - 1) * FRAME_STRIDE + (5 * (gid_x - 1) / 2 + 1) + (FRAME_STRIDE * FRAME_OFFSET); } #else start_idx = (2 * gid_y - 1) * FRAME_STRIDE + (3 * gid_x - 2) + (FRAME_STRIDE * FRAME_OFFSET); #endif // read in 8x4 chars uchar8 dat[4]; dat[0] = vload8(0, in + start_idx + FRAME_STRIDE*row_before_offset); if (gid_x == 0 && gid_y == 0) { // this wasn't a problem due to extra rows dat[1] = vload8(0, in + start_idx + FRAME_STRIDE*1 + 2); dat[1] = (uchar8)(0, 0, dat[1].s0, dat[1].s1, dat[1].s2, dat[1].s3, dat[1].s4, dat[1].s5); } else { dat[1] = vload8(0, in + start_idx + FRAME_STRIDE*1); } dat[2] = vload8(0, in + start_idx + FRAME_STRIDE*2); dat[3] = vload8(0, in + start_idx + FRAME_STRIDE*row_after_offset); #if IS_HDR uchar8 short_dat[4]; short_dat[0] = vload8(0, in + start_idx + FRAME_STRIDE*(row_before_offset+HDR_OFFSET/2) + FRAME_STRIDE/2); short_dat[1] = vload8(0, in + start_idx + FRAME_STRIDE*(1+HDR_OFFSET/2) + FRAME_STRIDE/2); short_dat[2] = vload8(0, in + start_idx + FRAME_STRIDE*(2+HDR_OFFSET/2) + FRAME_STRIDE/2); short_dat[3] = vload8(0, in + start_idx + FRAME_STRIDE*(row_after_offset+HDR_OFFSET/2) + FRAME_STRIDE/2); #endif // need extra bit for 10-bit #if IS_10BIT uchar extra[4]; if (!aligned10) { extra[0] = in[start_idx + FRAME_STRIDE*row_before_offset + 8]; extra[1] = in[start_idx + FRAME_STRIDE*1 + 8]; extra[2] = in[start_idx + FRAME_STRIDE*2 + 8]; extra[3] = in[start_idx + FRAME_STRIDE*row_after_offset + 8]; } #if IS_HDR uchar short_extra[4]; if (!aligned10) { short_extra[0] = in[start_idx + FRAME_STRIDE*(row_before_offset+HDR_OFFSET/2) + FRAME_STRIDE/2 + 8]; short_extra[1] = in[start_idx + FRAME_STRIDE*(1+HDR_OFFSET/2) + FRAME_STRIDE/2 + 8]; short_extra[2] = in[start_idx + FRAME_STRIDE*(2+HDR_OFFSET/2) + FRAME_STRIDE/2 + 8]; short_extra[3] = in[start_idx + FRAME_STRIDE*(row_after_offset+HDR_OFFSET/2) + FRAME_STRIDE/2 + 8]; } #endif #endif // correct vignetting #if VIGNETTING int gx = (gid_x*2 - RGB_WIDTH/2); int gy = (gid_y*2 - RGB_HEIGHT/2); const float gain = get_vignetting_s(gx*gx + gy*gy); #else const float gain = 1.0; #endif float4 v_rows[4]; // parse into floats #if IS_10BIT #if IS_HDR v_rows[row_read_order[0]] = val4_from_10x2(dat[0], extra[0], short_dat[0], short_extra[0], aligned10, gain, expo_time); v_rows[row_read_order[1]] = val4_from_10x2(dat[1], extra[1], short_dat[1], short_extra[1], aligned10, gain, expo_time); v_rows[row_read_order[2]] = val4_from_10x2(dat[2], extra[2], short_dat[2], short_extra[2], aligned10, gain, expo_time); v_rows[row_read_order[3]] = val4_from_10x2(dat[3], extra[3], short_dat[3], short_extra[3], aligned10, gain, expo_time); #else v_rows[row_read_order[0]] = val4_from_10(dat[0], extra[0], aligned10, gain); v_rows[row_read_order[1]] = val4_from_10(dat[1], extra[1], aligned10, gain); v_rows[row_read_order[2]] = val4_from_10(dat[2], extra[2], aligned10, gain); v_rows[row_read_order[3]] = val4_from_10(dat[3], extra[3], aligned10, gain); #endif #else v_rows[row_read_order[0]] = val4_from_12(dat[0], gain); v_rows[row_read_order[1]] = val4_from_12(dat[1], gain); v_rows[row_read_order[2]] = val4_from_12(dat[2], gain); v_rows[row_read_order[3]] = val4_from_12(dat[3], gain); #endif // mirror padding if (gid_x == 0) { v_rows[0].s0 = v_rows[0].s2; v_rows[1].s0 = v_rows[1].s2; v_rows[2].s0 = v_rows[2].s2; v_rows[3].s0 = v_rows[3].s2; } else if (gid_x == RGB_WIDTH/2 - 1) { v_rows[0].s3 = v_rows[0].s1; v_rows[1].s3 = v_rows[1].s1; v_rows[2].s3 = v_rows[2].s1; v_rows[3].s3 = v_rows[3].s1; } // a simplified version of https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf const float k01 = get_k(v_rows[0].s0, v_rows[1].s1, v_rows[0].s2, v_rows[1].s1); const float k02 = get_k(v_rows[0].s2, v_rows[1].s1, v_rows[2].s2, v_rows[1].s1); const float k03 = get_k(v_rows[2].s0, v_rows[1].s1, v_rows[2].s2, v_rows[1].s1); const float k04 = get_k(v_rows[0].s0, v_rows[1].s1, v_rows[2].s0, v_rows[1].s1); rgb.x = (k02*v_rows[1].s2+k04*v_rows[1].s0)/(k02+k04); // R_G1 rgb.y = v_rows[1].s1; // G1(R) rgb.z = (k01*v_rows[0].s1+k03*v_rows[2].s1)/(k01+k03); // B_G1 rgb_out[rgb_write_order[0]] = convert_uchar3_sat(apply_gamma(color_correct(clamp(rgb, 0.0, 1.0)), expo_time) * 255.0); const float k11 = get_k(v_rows[0].s1, v_rows[2].s1, v_rows[0].s3, v_rows[2].s3); const float k12 = get_k(v_rows[0].s2, v_rows[1].s1, v_rows[1].s3, v_rows[2].s2); const float k13 = get_k(v_rows[0].s1, v_rows[0].s3, v_rows[2].s1, v_rows[2].s3); const float k14 = get_k(v_rows[0].s2, v_rows[1].s3, v_rows[2].s2, v_rows[1].s1); rgb.x = v_rows[1].s2; // R rgb.y = (k11*(v_rows[0].s2+v_rows[2].s2)*0.5+k13*(v_rows[1].s3+v_rows[1].s1)*0.5)/(k11+k13); // G_R rgb.z = (k12*(v_rows[0].s3+v_rows[2].s1)*0.5+k14*(v_rows[0].s1+v_rows[2].s3)*0.5)/(k12+k14); // B_R rgb_out[rgb_write_order[1]] = convert_uchar3_sat(apply_gamma(color_correct(clamp(rgb, 0.0, 1.0)), expo_time) * 255.0); const float k21 = get_k(v_rows[1].s0, v_rows[3].s0, v_rows[1].s2, v_rows[3].s2); const float k22 = get_k(v_rows[1].s1, v_rows[2].s0, v_rows[2].s2, v_rows[3].s1); const float k23 = get_k(v_rows[1].s0, v_rows[1].s2, v_rows[3].s0, v_rows[3].s2); const float k24 = get_k(v_rows[1].s1, v_rows[2].s2, v_rows[3].s1, v_rows[2].s0); rgb.x = (k22*(v_rows[1].s2+v_rows[3].s0)*0.5+k24*(v_rows[1].s0+v_rows[3].s2)*0.5)/(k22+k24); // R_B rgb.y = (k21*(v_rows[1].s1+v_rows[3].s1)*0.5+k23*(v_rows[2].s2+v_rows[2].s0)*0.5)/(k21+k23); // G_B rgb.z = v_rows[2].s1; // B rgb_out[rgb_write_order[2]] = convert_uchar3_sat(apply_gamma(color_correct(clamp(rgb, 0.0, 1.0)), expo_time) * 255.0); const float k31 = get_k(v_rows[1].s1, v_rows[2].s2, v_rows[1].s3, v_rows[2].s2); const float k32 = get_k(v_rows[1].s3, v_rows[2].s2, v_rows[3].s3, v_rows[2].s2); const float k33 = get_k(v_rows[3].s1, v_rows[2].s2, v_rows[3].s3, v_rows[2].s2); const float k34 = get_k(v_rows[1].s1, v_rows[2].s2, v_rows[3].s1, v_rows[2].s2); rgb.x = (k31*v_rows[1].s2+k33*v_rows[3].s2)/(k31+k33); // R_G2 rgb.y = v_rows[2].s2; // G2(B) rgb.z = (k32*v_rows[2].s3+k34*v_rows[2].s1)/(k32+k34); // B_G2 rgb_out[rgb_write_order[3]] = convert_uchar3_sat(apply_gamma(color_correct(clamp(rgb, 0.0, 1.0)), expo_time) * 255.0); // write ys uchar2 yy = (uchar2)( RGB_TO_Y(rgb_out[0].s0, rgb_out[0].s1, rgb_out[0].s2), RGB_TO_Y(rgb_out[1].s0, rgb_out[1].s1, rgb_out[1].s2) ); vstore2(yy, 0, out + mad24(gid_y * 2, YUV_STRIDE, gid_x * 2)); yy = (uchar2)( RGB_TO_Y(rgb_out[2].s0, rgb_out[2].s1, rgb_out[2].s2), RGB_TO_Y(rgb_out[3].s0, rgb_out[3].s1, rgb_out[3].s2) ); vstore2(yy, 0, out + mad24(gid_y * 2 + 1, YUV_STRIDE, gid_x * 2)); // write uvs const short ar = AVERAGE(rgb_out[0].s0, rgb_out[1].s0, rgb_out[2].s0, rgb_out[3].s0); const short ag = AVERAGE(rgb_out[0].s1, rgb_out[1].s1, rgb_out[2].s1, rgb_out[3].s1); const short ab = AVERAGE(rgb_out[0].s2, rgb_out[1].s2, rgb_out[2].s2, rgb_out[3].s2); uchar2 uv = (uchar2)( RGB_TO_U(ar, ag, ab), RGB_TO_V(ar, ag, ab) ); vstore2(uv, 0, out + UV_OFFSET + mad24(gid_y, YUV_STRIDE, gid_x * 2)); }