diff --git a/selfdrive/camerad/cameras/camera_common.cc b/selfdrive/camerad/cameras/camera_common.cc index 3e4814e65..bba65f170 100644 --- a/selfdrive/camerad/cameras/camera_common.cc +++ b/selfdrive/camerad/cameras/camera_common.cc @@ -38,26 +38,23 @@ public: "-cl-fast-relaxed-math -cl-denorms-are-zero " "-DFRAME_WIDTH=%d -DFRAME_HEIGHT=%d -DFRAME_STRIDE=%d -DFRAME_OFFSET=%d " "-DRGB_WIDTH=%d -DRGB_HEIGHT=%d -DRGB_STRIDE=%d " - "-DBAYER_FLIP=%d -DHDR=%d -DCAM_NUM=%d", + "-DBAYER_FLIP=%d -DHDR=%d -DCAM_NUM=%d%s", ci->frame_width, ci->frame_height, ci->frame_stride, ci->frame_offset, b->rgb_width, b->rgb_height, b->rgb_stride, - ci->bayer_flip, ci->hdr, s->camera_num); + ci->bayer_flip, ci->hdr, s->camera_num, s->camera_num==1 ? " -DVIGNETTING" : ""); const char *cl_file = "cameras/real_debayer.cl"; cl_program prg_debayer = cl_program_from_file(context, device_id, cl_file, args); krnl_ = CL_CHECK_ERR(clCreateKernel(prg_debayer, "debayer10", &err)); CL_CHECK(clReleaseProgram(prg_debayer)); } - void queue(cl_command_queue q, cl_mem cam_buf_cl, cl_mem buf_cl, int width, int height, float gain, float black_level, cl_event *debayer_event) { + void queue(cl_command_queue q, cl_mem cam_buf_cl, cl_mem buf_cl, int width, int height, cl_event *debayer_event) { CL_CHECK(clSetKernelArg(krnl_, 0, sizeof(cl_mem), &cam_buf_cl)); CL_CHECK(clSetKernelArg(krnl_, 1, sizeof(cl_mem), &buf_cl)); const size_t globalWorkSize[] = {size_t(width / 2), size_t(height / 2)}; const int debayer_local_worksize = 16; - constexpr int localMemSize = (debayer_local_worksize * 2 + 2) * (debayer_local_worksize * 2 + 2) * 2; const size_t localWorkSize[] = {debayer_local_worksize, debayer_local_worksize}; - CL_CHECK(clSetKernelArg(krnl_, 2, localMemSize, 0)); - CL_CHECK(clSetKernelArg(krnl_, 3, sizeof(float), &black_level)); CL_CHECK(clEnqueueNDRangeKernel(q, krnl_, 2, NULL, globalWorkSize, localWorkSize, 0, 0, debayer_event)); } @@ -150,15 +147,7 @@ bool CameraBuf::acquire() { cur_camera_buf = &camera_bufs[cur_buf_idx]; if (debayer) { - float gain = 0.0; - float black_level = 42.0; -#ifndef QCOM2 - gain = camera_state->digital_gain; - if ((int)gain == 0) gain = 1.0; -#else - if (camera_state->camera_id == CAMERA_ID_IMX390) black_level = 64.0; -#endif - debayer->queue(q, camrabuf_cl, cur_yuv_buf->buf_cl, rgb_width, rgb_height, gain, black_level, &event); + debayer->queue(q, camrabuf_cl, cur_yuv_buf->buf_cl, rgb_width, rgb_height, &event); } else { assert(rgb_stride == camera_state->ci.frame_stride); rgb2yuv->queue(q, camrabuf_cl, cur_rgb_buf->buf_cl); diff --git a/selfdrive/camerad/cameras/real_debayer.cl b/selfdrive/camerad/cameras/real_debayer.cl index 8dd926adf..dc6044ed5 100644 --- a/selfdrive/camerad/cameras/real_debayer.cl +++ b/selfdrive/camerad/cameras/real_debayer.cl @@ -1,12 +1,3 @@ -#ifdef HALF_AS_FLOAT -#define half float -#define half2 float2 -#define half3 float3 -#define half4 float4 -#else -#pragma OPENCL EXTENSION cl_khr_fp16 : enable -#endif - #define UV_WIDTH RGB_WIDTH / 2 #define UV_HEIGHT RGB_HEIGHT / 2 #define U_OFFSET RGB_WIDTH * RGB_HEIGHT @@ -17,171 +8,129 @@ #define RGB_TO_V(r, g, b) ((mul24(r, 56) - mul24(g, 47) - mul24(b, 9) + 0x8080) >> 8) #define AVERAGE(x, y, z, w) ((convert_ushort(x) + convert_ushort(y) + convert_ushort(z) + convert_ushort(w) + 1) >> 1) -// post wb CCM -const __constant half3 color_correction_0 = (half3)(1.82717181, -0.31231438, 0.07307673); -const __constant half3 color_correction_1 = (half3)(-0.5743977, 1.36858544, -0.53183455); -const __constant half3 color_correction_2 = (half3)(-0.25277411, -0.05627105, 1.45875782); +float3 color_correct(float3 rgb) { + // color correction + float3 x = rgb.x * (float3)(1.82717181, -0.31231438, 0.07307673); + x += rgb.y * (float3)(-0.5743977, 1.36858544, -0.53183455); + x += rgb.z * (float3)(-0.25277411, -0.05627105, 1.45875782); -// tone mapping params -const half gamma_k = 0.75; -const half gamma_b = 0.125; -const half mp = 0.01; // ideally midpoint should be adaptive -const half rk = 9 - 100*mp; + // tone mapping params + const float gamma_k = 0.75; + const float gamma_b = 0.125; + const float mp = 0.01; // ideally midpoint should be adaptive + const float rk = 9 - 100*mp; -inline half3 gamma_apply(half3 x) { // poly approximation for s curve return (x > mp) ? ((rk * (x-mp) * (1-(gamma_k*mp+gamma_b)) * (1+1/(rk*(1-mp))) / (1+rk*(x-mp))) + gamma_k*mp + gamma_b) : ((rk * (x-mp) * (gamma_k*mp+gamma_b) * (1+1/(rk*mp)) / (1-rk*(x-mp))) + gamma_k*mp + gamma_b); } -inline half3 color_correct(half3 rgb) { - half3 ret = (half)rgb.x * color_correction_0; - ret += (half)rgb.y * color_correction_1; - ret += (half)rgb.z * color_correction_2; - return gamma_apply(ret); -} - -inline half get_vignetting_s(float r) { +float get_vignetting_s(float r) { if (r < 62500) { - return (half)(1.0f + 0.0000008f*r); + return (1.0f + 0.0000008f*r); } else if (r < 490000) { - return (half)(0.9625f + 0.0000014f*r); + return (0.9625f + 0.0000014f*r); } else if (r < 1102500) { - return (half)(1.26434f + 0.0000000000016f*r*r); + return (1.26434f + 0.0000000000016f*r*r); } else { - return (half)(0.53503625f + 0.0000000000022f*r*r); + return (0.53503625f + 0.0000000000022f*r*r); } } -inline half val_from_10(const uchar * source, int gx, int gy, half black_level) { - // parse 12bit - int start = gy * FRAME_STRIDE + (3 * (gx / 2)) + (FRAME_STRIDE * FRAME_OFFSET); - int offset = gx % 2; - uint major = (uint)source[start + offset] << 4; - uint minor = (source[start + 2] >> (4 * offset)) & 0xf; - half pv = ((half)(major + minor)) / 4.0; - - // normalize - pv = max((half)0.0, pv - black_level); - pv /= (1024.0 - black_level); - - // correct vignetting - if (CAM_NUM == 1) { // fcamera - gx = (gx - RGB_WIDTH/2); - gy = (gy - RGB_HEIGHT/2); - pv *= get_vignetting_s(gx*gx + gy*gy); - } - - pv = clamp(pv, (half)0.0, (half)1.0); - return pv; +float4 val4_from_12(uchar8 pvs, float gain) { + uint4 parsed = (uint4)(((uint)pvs.s0<<4) + (pvs.s1>>4), // is from the previous 10 bit + ((uint)pvs.s2<<4) + (pvs.s4&0xF), + ((uint)pvs.s3<<4) + (pvs.s4>>4), + ((uint)pvs.s5<<4) + (pvs.s7&0xF)); + // normalize and scale + float4 pv = (convert_float4(parsed) - 168.0) / (4096.0 - 168.0); + return clamp(pv*gain, 0.0, 1.0); } -inline half get_k(half a, half b, half c, half d) { +float get_k(float a, float b, float c, float d) { return 2.0 - (fabs(a - b) + fabs(c - d)); } -__kernel void debayer10(const __global uchar * in, - __global uchar * out, - __local half * cached, - float black_level - ) +__kernel void debayer10(const __global uchar * in, __global uchar * out) { const int gid_x = get_global_id(0); const int gid_y = get_global_id(1); - const int lid_x = get_local_id(0); - const int lid_y = get_local_id(1); - - const int localRowLen = mad24(get_local_size(0), 2, 2); // 2 padding - const int localColLen = mad24(get_local_size(1), 2, 2); - - const int x_global = mul24(gid_x, 2); - const int y_global = mul24(gid_y, 2); + const int y_top_mod = (gid_y == 0) ? 2: 0; + const int y_bot_mod = (gid_y == (RGB_HEIGHT/2 - 1)) ? 1: 3; - const int x_local = mad24(lid_x, 2, 1); - const int y_local = mad24(lid_y, 2, 1); - - const int x_global_mod = (gid_x == 0 || gid_x == get_global_size(0) - 1) ? -1: 1; - const int y_global_mod = (gid_y == 0 || gid_y == get_global_size(1) - 1) ? -1: 1; + float3 rgb; + uchar3 rgb_out[4]; - int localColOffset = 0; - int globalColOffset; + int start = (2 * gid_y - 1) * FRAME_STRIDE + (3 * gid_x - 2) + (FRAME_STRIDE * FRAME_OFFSET); - cached[mad24(y_local + 0, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 0, black_level); - cached[mad24(y_local + 0, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 0, black_level); - cached[mad24(y_local + 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 1, black_level); - cached[mad24(y_local + 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 1, black_level); + // read in 8x4 chars + uchar8 dat[4]; + dat[0] = vload8(0, in + start + FRAME_STRIDE*y_top_mod); + dat[1] = vload8(0, in + start + FRAME_STRIDE*1); + dat[2] = vload8(0, in + start + FRAME_STRIDE*2); + dat[3] = vload8(0, in + start + FRAME_STRIDE*y_bot_mod); - if (lid_x == 0) { // left edge - localColOffset = -1; - globalColOffset = -x_global_mod; - cached[mad24(y_local + 0, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 0, black_level); - cached[mad24(y_local + 1, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 1, black_level); - } else if (lid_x == get_local_size(0) - 1) { // right edge - localColOffset = 2; - globalColOffset = x_global_mod + 1; - cached[mad24(y_local + 0, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 0, black_level); - cached[mad24(y_local + 1, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 1, black_level); - } - - if (lid_y == 0) { // top row - cached[mad24(y_local - 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global - y_global_mod, black_level); - cached[mad24(y_local - 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global - y_global_mod, black_level); - if (localColOffset != 0) { // cache corners - cached[mad24(y_local - 1, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global - y_global_mod, black_level); - } - } else if (lid_y == get_local_size(1) - 1) { // bottom row - cached[mad24(y_local + 2, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + y_global_mod + 1, black_level); - cached[mad24(y_local + 2, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + y_global_mod + 1, black_level); - if (localColOffset != 0) { // cache corners - cached[mad24(y_local + 2, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global + y_global_mod + 1, black_level); - } + // correct vignetting + #if VIGNETTING + int gx = (gid_x*2 - RGB_WIDTH/2); + int gy = (gid_y*2 - RGB_HEIGHT/2); + const float gain = get_vignetting_s(gx*gx + gy*gy); + #else + const float gain = 1.0; + #endif + + // process them to floats + float4 va = val4_from_12(dat[0], gain); + float4 vb = val4_from_12(dat[1], gain); + float4 vc = val4_from_12(dat[2], gain); + float4 vd = val4_from_12(dat[3], gain); + + if (gid_x == 0) { + va.s0 = va.s2; + vb.s0 = vb.s2; + vc.s0 = vc.s2; + vd.s0 = vd.s2; + } else if (gid_x == RGB_WIDTH/2 - 1) { + va.s3 = va.s1; + vb.s3 = vb.s1; + vc.s3 = vc.s1; + vd.s3 = vd.s1; } - // sync - barrier(CLK_LOCAL_MEM_FENCE); - - half3 rgb; - uchar3 rgb_out[4]; - - const half4 va = vload4(0, cached + mad24(lid_y * 2 + 0, localRowLen, lid_x * 2)); - const half4 vb = vload4(0, cached + mad24(lid_y * 2 + 1, localRowLen, lid_x * 2)); - const half4 vc = vload4(0, cached + mad24(lid_y * 2 + 2, localRowLen, lid_x * 2)); - const half4 vd = vload4(0, cached + mad24(lid_y * 2 + 3, localRowLen, lid_x * 2)); - // a simplified version of https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf - const half k01 = get_k(va.s0, vb.s1, va.s2, vb.s1); - const half k02 = get_k(va.s2, vb.s1, vc.s2, vb.s1); - const half k03 = get_k(vc.s0, vb.s1, vc.s2, vb.s1); - const half k04 = get_k(va.s0, vb.s1, vc.s0, vb.s1); + const float k01 = get_k(va.s0, vb.s1, va.s2, vb.s1); + const float k02 = get_k(va.s2, vb.s1, vc.s2, vb.s1); + const float k03 = get_k(vc.s0, vb.s1, vc.s2, vb.s1); + const float k04 = get_k(va.s0, vb.s1, vc.s0, vb.s1); rgb.x = (k02*vb.s2+k04*vb.s0)/(k02+k04); // R_G1 rgb.y = vb.s1; // G1(R) rgb.z = (k01*va.s1+k03*vc.s1)/(k01+k03); // B_G1 rgb_out[0] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0); - const half k11 = get_k(va.s1, vc.s1, va.s3, vc.s3); - const half k12 = get_k(va.s2, vb.s1, vb.s3, vc.s2); - const half k13 = get_k(va.s1, va.s3, vc.s1, vc.s3); - const half k14 = get_k(va.s2, vb.s3, vc.s2, vb.s1); + const float k11 = get_k(va.s1, vc.s1, va.s3, vc.s3); + const float k12 = get_k(va.s2, vb.s1, vb.s3, vc.s2); + const float k13 = get_k(va.s1, va.s3, vc.s1, vc.s3); + const float k14 = get_k(va.s2, vb.s3, vc.s2, vb.s1); rgb.x = vb.s2; // R rgb.y = (k11*(va.s2+vc.s2)*0.5+k13*(vb.s3+vb.s1)*0.5)/(k11+k13); // G_R rgb.z = (k12*(va.s3+vc.s1)*0.5+k14*(va.s1+vc.s3)*0.5)/(k12+k14); // B_R rgb_out[1] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0); - const half k21 = get_k(vb.s0, vd.s0, vb.s2, vd.s2); - const half k22 = get_k(vb.s1, vc.s0, vc.s2, vd.s1); - const half k23 = get_k(vb.s0, vb.s2, vd.s0, vd.s2); - const half k24 = get_k(vb.s1, vc.s2, vd.s1, vc.s0); + const float k21 = get_k(vb.s0, vd.s0, vb.s2, vd.s2); + const float k22 = get_k(vb.s1, vc.s0, vc.s2, vd.s1); + const float k23 = get_k(vb.s0, vb.s2, vd.s0, vd.s2); + const float k24 = get_k(vb.s1, vc.s2, vd.s1, vc.s0); rgb.x = (k22*(vb.s2+vd.s0)*0.5+k24*(vb.s0+vd.s2)*0.5)/(k22+k24); // R_B rgb.y = (k21*(vb.s1+vd.s1)*0.5+k23*(vc.s2+vc.s0)*0.5)/(k21+k23); // G_B rgb.z = vc.s1; // B rgb_out[2] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0); - const half k31 = get_k(vb.s1, vc.s2, vb.s3, vc.s2); - const half k32 = get_k(vb.s3, vc.s2, vd.s3, vc.s2); - const half k33 = get_k(vd.s1, vc.s2, vd.s3, vc.s2); - const half k34 = get_k(vb.s1, vc.s2, vd.s1, vc.s2); + const float k31 = get_k(vb.s1, vc.s2, vb.s3, vc.s2); + const float k32 = get_k(vb.s3, vc.s2, vd.s3, vc.s2); + const float k33 = get_k(vd.s1, vc.s2, vd.s3, vc.s2); + const float k34 = get_k(vb.s1, vc.s2, vd.s1, vc.s2); rgb.x = (k31*vb.s2+k33*vd.s2)/(k31+k33); // R_G2 rgb.y = vc.s2; // G2(B) rgb.z = (k32*vc.s3+k34*vc.s1)/(k32+k34); // B_G2 diff --git a/selfdrive/hardware/tici/test_power_draw.py b/selfdrive/hardware/tici/test_power_draw.py index 0435193bc..15892ac5e 100755 --- a/selfdrive/hardware/tici/test_power_draw.py +++ b/selfdrive/hardware/tici/test_power_draw.py @@ -19,7 +19,7 @@ class Proc: warmup: float = 3. PROCS = [ - Proc('camerad', 2.17), + Proc('camerad', 2.02), Proc('modeld', 0.95), Proc('dmonitoringmodeld', 0.25), Proc('encoderd', 0.42), diff --git a/selfdrive/test/process_replay/test_debayer.py b/selfdrive/test/process_replay/test_debayer.py index a70d478ff..a17c71b60 100755 --- a/selfdrive/test/process_replay/test_debayer.py +++ b/selfdrive/test/process_replay/test_debayer.py @@ -80,8 +80,7 @@ def debayer_frame(ctx, debayer_prg, data, rgb=False): yuv_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, FRAME_WIDTH * FRAME_HEIGHT + UV_SIZE * 2) local_worksize = (20, 20) if TICI else (4, 4) - local_mem = cl.LocalMemory(3528 if TICI else 400) - ev1 = debayer_prg.debayer10(q, (UV_WIDTH, UV_HEIGHT), local_worksize, cam_g, yuv_g, local_mem, np.float32(42)) + ev1 = debayer_prg.debayer10(q, (UV_WIDTH, UV_HEIGHT), local_worksize, cam_g, yuv_g) cl.enqueue_copy(q, yuv_buff, yuv_g, wait_for=[ev1]).wait() cl.enqueue_barrier(q)