make debayer faster (#24557)

* remove local caching

* remove local caching

* reduce camerad power

* break stupid imx390 black level support

* handle edges

* now 13ms, vignetting is 'slightly' less correct

* halfs->floats and inlines, down to 12.9ms

* oops, fix float

* val from 12 is ushort

* don't decide vignetting in the debayer kernel

* 7.77 ms

* adding back black level support was free

* Revert "adding back black level support was free"

This reverts commit a841d17727.

* minor

* rip out unused gain and black level, remove print

* save 150mW

* fix replay test

* fix top/bottom rows

* lame left right edge fix

Co-authored-by: Comma Device <device@comma.ai>
Co-authored-by: Joost Wooning <jwooning@gmail.com>
old-commit-hash: 72b52f8b7d
taco
George Hotz 3 years ago committed by GitHub
parent a6bcd14abd
commit 3ee5b1bf07
  1. 19
      selfdrive/camerad/cameras/camera_common.cc
  2. 205
      selfdrive/camerad/cameras/real_debayer.cl
  3. 2
      selfdrive/hardware/tici/test_power_draw.py
  4. 3
      selfdrive/test/process_replay/test_debayer.py

@ -38,26 +38,23 @@ public:
"-cl-fast-relaxed-math -cl-denorms-are-zero "
"-DFRAME_WIDTH=%d -DFRAME_HEIGHT=%d -DFRAME_STRIDE=%d -DFRAME_OFFSET=%d "
"-DRGB_WIDTH=%d -DRGB_HEIGHT=%d -DRGB_STRIDE=%d "
"-DBAYER_FLIP=%d -DHDR=%d -DCAM_NUM=%d",
"-DBAYER_FLIP=%d -DHDR=%d -DCAM_NUM=%d%s",
ci->frame_width, ci->frame_height, ci->frame_stride, ci->frame_offset,
b->rgb_width, b->rgb_height, b->rgb_stride,
ci->bayer_flip, ci->hdr, s->camera_num);
ci->bayer_flip, ci->hdr, s->camera_num, s->camera_num==1 ? " -DVIGNETTING" : "");
const char *cl_file = "cameras/real_debayer.cl";
cl_program prg_debayer = cl_program_from_file(context, device_id, cl_file, args);
krnl_ = CL_CHECK_ERR(clCreateKernel(prg_debayer, "debayer10", &err));
CL_CHECK(clReleaseProgram(prg_debayer));
}
void queue(cl_command_queue q, cl_mem cam_buf_cl, cl_mem buf_cl, int width, int height, float gain, float black_level, cl_event *debayer_event) {
void queue(cl_command_queue q, cl_mem cam_buf_cl, cl_mem buf_cl, int width, int height, cl_event *debayer_event) {
CL_CHECK(clSetKernelArg(krnl_, 0, sizeof(cl_mem), &cam_buf_cl));
CL_CHECK(clSetKernelArg(krnl_, 1, sizeof(cl_mem), &buf_cl));
const size_t globalWorkSize[] = {size_t(width / 2), size_t(height / 2)};
const int debayer_local_worksize = 16;
constexpr int localMemSize = (debayer_local_worksize * 2 + 2) * (debayer_local_worksize * 2 + 2) * 2;
const size_t localWorkSize[] = {debayer_local_worksize, debayer_local_worksize};
CL_CHECK(clSetKernelArg(krnl_, 2, localMemSize, 0));
CL_CHECK(clSetKernelArg(krnl_, 3, sizeof(float), &black_level));
CL_CHECK(clEnqueueNDRangeKernel(q, krnl_, 2, NULL, globalWorkSize, localWorkSize, 0, 0, debayer_event));
}
@ -150,15 +147,7 @@ bool CameraBuf::acquire() {
cur_camera_buf = &camera_bufs[cur_buf_idx];
if (debayer) {
float gain = 0.0;
float black_level = 42.0;
#ifndef QCOM2
gain = camera_state->digital_gain;
if ((int)gain == 0) gain = 1.0;
#else
if (camera_state->camera_id == CAMERA_ID_IMX390) black_level = 64.0;
#endif
debayer->queue(q, camrabuf_cl, cur_yuv_buf->buf_cl, rgb_width, rgb_height, gain, black_level, &event);
debayer->queue(q, camrabuf_cl, cur_yuv_buf->buf_cl, rgb_width, rgb_height, &event);
} else {
assert(rgb_stride == camera_state->ci.frame_stride);
rgb2yuv->queue(q, camrabuf_cl, cur_rgb_buf->buf_cl);

@ -1,12 +1,3 @@
#ifdef HALF_AS_FLOAT
#define half float
#define half2 float2
#define half3 float3
#define half4 float4
#else
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
#endif
#define UV_WIDTH RGB_WIDTH / 2
#define UV_HEIGHT RGB_HEIGHT / 2
#define U_OFFSET RGB_WIDTH * RGB_HEIGHT
@ -17,171 +8,129 @@
#define RGB_TO_V(r, g, b) ((mul24(r, 56) - mul24(g, 47) - mul24(b, 9) + 0x8080) >> 8)
#define AVERAGE(x, y, z, w) ((convert_ushort(x) + convert_ushort(y) + convert_ushort(z) + convert_ushort(w) + 1) >> 1)
// post wb CCM
const __constant half3 color_correction_0 = (half3)(1.82717181, -0.31231438, 0.07307673);
const __constant half3 color_correction_1 = (half3)(-0.5743977, 1.36858544, -0.53183455);
const __constant half3 color_correction_2 = (half3)(-0.25277411, -0.05627105, 1.45875782);
float3 color_correct(float3 rgb) {
// color correction
float3 x = rgb.x * (float3)(1.82717181, -0.31231438, 0.07307673);
x += rgb.y * (float3)(-0.5743977, 1.36858544, -0.53183455);
x += rgb.z * (float3)(-0.25277411, -0.05627105, 1.45875782);
// tone mapping params
const half gamma_k = 0.75;
const half gamma_b = 0.125;
const half mp = 0.01; // ideally midpoint should be adaptive
const half rk = 9 - 100*mp;
// tone mapping params
const float gamma_k = 0.75;
const float gamma_b = 0.125;
const float mp = 0.01; // ideally midpoint should be adaptive
const float rk = 9 - 100*mp;
inline half3 gamma_apply(half3 x) {
// poly approximation for s curve
return (x > mp) ?
((rk * (x-mp) * (1-(gamma_k*mp+gamma_b)) * (1+1/(rk*(1-mp))) / (1+rk*(x-mp))) + gamma_k*mp + gamma_b) :
((rk * (x-mp) * (gamma_k*mp+gamma_b) * (1+1/(rk*mp)) / (1-rk*(x-mp))) + gamma_k*mp + gamma_b);
}
inline half3 color_correct(half3 rgb) {
half3 ret = (half)rgb.x * color_correction_0;
ret += (half)rgb.y * color_correction_1;
ret += (half)rgb.z * color_correction_2;
return gamma_apply(ret);
}
inline half get_vignetting_s(float r) {
float get_vignetting_s(float r) {
if (r < 62500) {
return (half)(1.0f + 0.0000008f*r);
return (1.0f + 0.0000008f*r);
} else if (r < 490000) {
return (half)(0.9625f + 0.0000014f*r);
return (0.9625f + 0.0000014f*r);
} else if (r < 1102500) {
return (half)(1.26434f + 0.0000000000016f*r*r);
return (1.26434f + 0.0000000000016f*r*r);
} else {
return (half)(0.53503625f + 0.0000000000022f*r*r);
return (0.53503625f + 0.0000000000022f*r*r);
}
}
inline half val_from_10(const uchar * source, int gx, int gy, half black_level) {
// parse 12bit
int start = gy * FRAME_STRIDE + (3 * (gx / 2)) + (FRAME_STRIDE * FRAME_OFFSET);
int offset = gx % 2;
uint major = (uint)source[start + offset] << 4;
uint minor = (source[start + 2] >> (4 * offset)) & 0xf;
half pv = ((half)(major + minor)) / 4.0;
// normalize
pv = max((half)0.0, pv - black_level);
pv /= (1024.0 - black_level);
// correct vignetting
if (CAM_NUM == 1) { // fcamera
gx = (gx - RGB_WIDTH/2);
gy = (gy - RGB_HEIGHT/2);
pv *= get_vignetting_s(gx*gx + gy*gy);
}
pv = clamp(pv, (half)0.0, (half)1.0);
return pv;
float4 val4_from_12(uchar8 pvs, float gain) {
uint4 parsed = (uint4)(((uint)pvs.s0<<4) + (pvs.s1>>4), // is from the previous 10 bit
((uint)pvs.s2<<4) + (pvs.s4&0xF),
((uint)pvs.s3<<4) + (pvs.s4>>4),
((uint)pvs.s5<<4) + (pvs.s7&0xF));
// normalize and scale
float4 pv = (convert_float4(parsed) - 168.0) / (4096.0 - 168.0);
return clamp(pv*gain, 0.0, 1.0);
}
inline half get_k(half a, half b, half c, half d) {
float get_k(float a, float b, float c, float d) {
return 2.0 - (fabs(a - b) + fabs(c - d));
}
__kernel void debayer10(const __global uchar * in,
__global uchar * out,
__local half * cached,
float black_level
)
__kernel void debayer10(const __global uchar * in, __global uchar * out)
{
const int gid_x = get_global_id(0);
const int gid_y = get_global_id(1);
const int lid_x = get_local_id(0);
const int lid_y = get_local_id(1);
const int localRowLen = mad24(get_local_size(0), 2, 2); // 2 padding
const int localColLen = mad24(get_local_size(1), 2, 2);
const int x_global = mul24(gid_x, 2);
const int y_global = mul24(gid_y, 2);
const int y_top_mod = (gid_y == 0) ? 2: 0;
const int y_bot_mod = (gid_y == (RGB_HEIGHT/2 - 1)) ? 1: 3;
const int x_local = mad24(lid_x, 2, 1);
const int y_local = mad24(lid_y, 2, 1);
const int x_global_mod = (gid_x == 0 || gid_x == get_global_size(0) - 1) ? -1: 1;
const int y_global_mod = (gid_y == 0 || gid_y == get_global_size(1) - 1) ? -1: 1;
float3 rgb;
uchar3 rgb_out[4];
int localColOffset = 0;
int globalColOffset;
int start = (2 * gid_y - 1) * FRAME_STRIDE + (3 * gid_x - 2) + (FRAME_STRIDE * FRAME_OFFSET);
cached[mad24(y_local + 0, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 0, black_level);
cached[mad24(y_local + 0, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 0, black_level);
cached[mad24(y_local + 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 1, black_level);
cached[mad24(y_local + 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 1, black_level);
// read in 8x4 chars
uchar8 dat[4];
dat[0] = vload8(0, in + start + FRAME_STRIDE*y_top_mod);
dat[1] = vload8(0, in + start + FRAME_STRIDE*1);
dat[2] = vload8(0, in + start + FRAME_STRIDE*2);
dat[3] = vload8(0, in + start + FRAME_STRIDE*y_bot_mod);
if (lid_x == 0) { // left edge
localColOffset = -1;
globalColOffset = -x_global_mod;
cached[mad24(y_local + 0, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 0, black_level);
cached[mad24(y_local + 1, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 1, black_level);
} else if (lid_x == get_local_size(0) - 1) { // right edge
localColOffset = 2;
globalColOffset = x_global_mod + 1;
cached[mad24(y_local + 0, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 0, black_level);
cached[mad24(y_local + 1, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 1, black_level);
}
if (lid_y == 0) { // top row
cached[mad24(y_local - 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global - y_global_mod, black_level);
cached[mad24(y_local - 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global - y_global_mod, black_level);
if (localColOffset != 0) { // cache corners
cached[mad24(y_local - 1, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global - y_global_mod, black_level);
}
} else if (lid_y == get_local_size(1) - 1) { // bottom row
cached[mad24(y_local + 2, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + y_global_mod + 1, black_level);
cached[mad24(y_local + 2, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + y_global_mod + 1, black_level);
if (localColOffset != 0) { // cache corners
cached[mad24(y_local + 2, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global + y_global_mod + 1, black_level);
}
// correct vignetting
#if VIGNETTING
int gx = (gid_x*2 - RGB_WIDTH/2);
int gy = (gid_y*2 - RGB_HEIGHT/2);
const float gain = get_vignetting_s(gx*gx + gy*gy);
#else
const float gain = 1.0;
#endif
// process them to floats
float4 va = val4_from_12(dat[0], gain);
float4 vb = val4_from_12(dat[1], gain);
float4 vc = val4_from_12(dat[2], gain);
float4 vd = val4_from_12(dat[3], gain);
if (gid_x == 0) {
va.s0 = va.s2;
vb.s0 = vb.s2;
vc.s0 = vc.s2;
vd.s0 = vd.s2;
} else if (gid_x == RGB_WIDTH/2 - 1) {
va.s3 = va.s1;
vb.s3 = vb.s1;
vc.s3 = vc.s1;
vd.s3 = vd.s1;
}
// sync
barrier(CLK_LOCAL_MEM_FENCE);
half3 rgb;
uchar3 rgb_out[4];
const half4 va = vload4(0, cached + mad24(lid_y * 2 + 0, localRowLen, lid_x * 2));
const half4 vb = vload4(0, cached + mad24(lid_y * 2 + 1, localRowLen, lid_x * 2));
const half4 vc = vload4(0, cached + mad24(lid_y * 2 + 2, localRowLen, lid_x * 2));
const half4 vd = vload4(0, cached + mad24(lid_y * 2 + 3, localRowLen, lid_x * 2));
// a simplified version of https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf
const half k01 = get_k(va.s0, vb.s1, va.s2, vb.s1);
const half k02 = get_k(va.s2, vb.s1, vc.s2, vb.s1);
const half k03 = get_k(vc.s0, vb.s1, vc.s2, vb.s1);
const half k04 = get_k(va.s0, vb.s1, vc.s0, vb.s1);
const float k01 = get_k(va.s0, vb.s1, va.s2, vb.s1);
const float k02 = get_k(va.s2, vb.s1, vc.s2, vb.s1);
const float k03 = get_k(vc.s0, vb.s1, vc.s2, vb.s1);
const float k04 = get_k(va.s0, vb.s1, vc.s0, vb.s1);
rgb.x = (k02*vb.s2+k04*vb.s0)/(k02+k04); // R_G1
rgb.y = vb.s1; // G1(R)
rgb.z = (k01*va.s1+k03*vc.s1)/(k01+k03); // B_G1
rgb_out[0] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0);
const half k11 = get_k(va.s1, vc.s1, va.s3, vc.s3);
const half k12 = get_k(va.s2, vb.s1, vb.s3, vc.s2);
const half k13 = get_k(va.s1, va.s3, vc.s1, vc.s3);
const half k14 = get_k(va.s2, vb.s3, vc.s2, vb.s1);
const float k11 = get_k(va.s1, vc.s1, va.s3, vc.s3);
const float k12 = get_k(va.s2, vb.s1, vb.s3, vc.s2);
const float k13 = get_k(va.s1, va.s3, vc.s1, vc.s3);
const float k14 = get_k(va.s2, vb.s3, vc.s2, vb.s1);
rgb.x = vb.s2; // R
rgb.y = (k11*(va.s2+vc.s2)*0.5+k13*(vb.s3+vb.s1)*0.5)/(k11+k13); // G_R
rgb.z = (k12*(va.s3+vc.s1)*0.5+k14*(va.s1+vc.s3)*0.5)/(k12+k14); // B_R
rgb_out[1] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0);
const half k21 = get_k(vb.s0, vd.s0, vb.s2, vd.s2);
const half k22 = get_k(vb.s1, vc.s0, vc.s2, vd.s1);
const half k23 = get_k(vb.s0, vb.s2, vd.s0, vd.s2);
const half k24 = get_k(vb.s1, vc.s2, vd.s1, vc.s0);
const float k21 = get_k(vb.s0, vd.s0, vb.s2, vd.s2);
const float k22 = get_k(vb.s1, vc.s0, vc.s2, vd.s1);
const float k23 = get_k(vb.s0, vb.s2, vd.s0, vd.s2);
const float k24 = get_k(vb.s1, vc.s2, vd.s1, vc.s0);
rgb.x = (k22*(vb.s2+vd.s0)*0.5+k24*(vb.s0+vd.s2)*0.5)/(k22+k24); // R_B
rgb.y = (k21*(vb.s1+vd.s1)*0.5+k23*(vc.s2+vc.s0)*0.5)/(k21+k23); // G_B
rgb.z = vc.s1; // B
rgb_out[2] = convert_uchar3_sat(color_correct(clamp(rgb, 0.0, 1.0)) * 255.0);
const half k31 = get_k(vb.s1, vc.s2, vb.s3, vc.s2);
const half k32 = get_k(vb.s3, vc.s2, vd.s3, vc.s2);
const half k33 = get_k(vd.s1, vc.s2, vd.s3, vc.s2);
const half k34 = get_k(vb.s1, vc.s2, vd.s1, vc.s2);
const float k31 = get_k(vb.s1, vc.s2, vb.s3, vc.s2);
const float k32 = get_k(vb.s3, vc.s2, vd.s3, vc.s2);
const float k33 = get_k(vd.s1, vc.s2, vd.s3, vc.s2);
const float k34 = get_k(vb.s1, vc.s2, vd.s1, vc.s2);
rgb.x = (k31*vb.s2+k33*vd.s2)/(k31+k33); // R_G2
rgb.y = vc.s2; // G2(B)
rgb.z = (k32*vc.s3+k34*vc.s1)/(k32+k34); // B_G2

@ -19,7 +19,7 @@ class Proc:
warmup: float = 3.
PROCS = [
Proc('camerad', 2.17),
Proc('camerad', 2.02),
Proc('modeld', 0.95),
Proc('dmonitoringmodeld', 0.25),
Proc('encoderd', 0.42),

@ -80,8 +80,7 @@ def debayer_frame(ctx, debayer_prg, data, rgb=False):
yuv_g = cl.Buffer(ctx, cl.mem_flags.WRITE_ONLY, FRAME_WIDTH * FRAME_HEIGHT + UV_SIZE * 2)
local_worksize = (20, 20) if TICI else (4, 4)
local_mem = cl.LocalMemory(3528 if TICI else 400)
ev1 = debayer_prg.debayer10(q, (UV_WIDTH, UV_HEIGHT), local_worksize, cam_g, yuv_g, local_mem, np.float32(42))
ev1 = debayer_prg.debayer10(q, (UV_WIDTH, UV_HEIGHT), local_worksize, cam_g, yuv_g)
cl.enqueue_copy(q, yuv_buff, yuv_g, wait_for=[ev1]).wait()
cl.enqueue_barrier(q)

Loading…
Cancel
Save