Revert "remove local caching"

This reverts commit 51d441ad78.
pull/24557/head
Comma Device 3 years ago
parent 51d441ad78
commit 5612c4f22c
  1. 74
      selfdrive/camerad/cameras/real_debayer.cl

@ -83,36 +83,72 @@ inline half get_k(half a, half b, half c, half d) {
__kernel void debayer10(const __global uchar * in,
__global uchar * out,
__local half * cached,
float black_level
)
{
const int gid_x = get_global_id(0);
const int gid_y = get_global_id(1);
half3 rgb;
uchar3 rgb_out[4];
const int lid_x = get_local_id(0);
const int lid_y = get_local_id(1);
const int localRowLen = mad24(get_local_size(0), 2, 2); // 2 padding
const int localColLen = mad24(get_local_size(1), 2, 2);
const int x_global = mul24(gid_x, 2);
const int y_global = mul24(gid_y, 2);
const int x_local = mad24(lid_x, 2, 1);
const int y_local = mad24(lid_y, 2, 1);
const int x_global_mod = (gid_x == 0 || gid_x == get_global_size(0) - 1) ? -1: 1;
const int y_global_mod = (gid_y == 0 || gid_y == get_global_size(1) - 1) ? -1: 1;
int localColOffset = 0;
int globalColOffset;
half4 va, vb, vc, vd;
cached[mad24(y_local + 0, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 0, black_level);
cached[mad24(y_local + 0, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 0, black_level);
cached[mad24(y_local + 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + 1, black_level);
cached[mad24(y_local + 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + 1, black_level);
va.s0 = val_from_10(in, gid_x*2-1, gid_y*2-1, black_level);
va.s1 = val_from_10(in, gid_x*2+0, gid_y*2-1, black_level);
va.s2 = val_from_10(in, gid_x*2+1, gid_y*2-1, black_level);
va.s3 = val_from_10(in, gid_x*2+2, gid_y*2-1, black_level);
if (lid_x == 0) { // left edge
localColOffset = -1;
globalColOffset = -x_global_mod;
cached[mad24(y_local + 0, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 0, black_level);
cached[mad24(y_local + 1, localRowLen, x_local - 1)] = val_from_10(in, x_global - x_global_mod, y_global + 1, black_level);
} else if (lid_x == get_local_size(0) - 1) { // right edge
localColOffset = 2;
globalColOffset = x_global_mod + 1;
cached[mad24(y_local + 0, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 0, black_level);
cached[mad24(y_local + 1, localRowLen, x_local + 2)] = val_from_10(in, x_global + x_global_mod + 1, y_global + 1, black_level);
}
vb.s0 = val_from_10(in, gid_x*2-1, gid_y*2+0, black_level);
vb.s1 = val_from_10(in, gid_x*2+0, gid_y*2+0, black_level); // G(R)
vb.s2 = val_from_10(in, gid_x*2+1, gid_y*2+0, black_level); // R
vb.s3 = val_from_10(in, gid_x*2+2, gid_y*2+0, black_level);
if (lid_y == 0) { // top row
cached[mad24(y_local - 1, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global - y_global_mod, black_level);
cached[mad24(y_local - 1, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global - y_global_mod, black_level);
if (localColOffset != 0) { // cache corners
cached[mad24(y_local - 1, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global - y_global_mod, black_level);
}
} else if (lid_y == get_local_size(1) - 1) { // bottom row
cached[mad24(y_local + 2, localRowLen, x_local + 0)] = val_from_10(in, x_global + 0, y_global + y_global_mod + 1, black_level);
cached[mad24(y_local + 2, localRowLen, x_local + 1)] = val_from_10(in, x_global + 1, y_global + y_global_mod + 1, black_level);
if (localColOffset != 0) { // cache corners
cached[mad24(y_local + 2, localRowLen, x_local + localColOffset)] = val_from_10(in, x_global + globalColOffset, y_global + y_global_mod + 1, black_level);
}
}
vc.s0 = val_from_10(in, gid_x*2-1, gid_y*2+1, black_level);
vc.s1 = val_from_10(in, gid_x*2+0, gid_y*2+1, black_level); // B
vc.s2 = val_from_10(in, gid_x*2+1, gid_y*2+1, black_level); // G(B)
vc.s3 = val_from_10(in, gid_x*2+2, gid_y*2+1, black_level);
// sync
barrier(CLK_LOCAL_MEM_FENCE);
half3 rgb;
uchar3 rgb_out[4];
vd.s0 = val_from_10(in, gid_x*2-1, gid_y*2+2, black_level);
vd.s1 = val_from_10(in, gid_x*2+0, gid_y*2+2, black_level);
vd.s2 = val_from_10(in, gid_x*2+1, gid_y*2+2, black_level);
vd.s3 = val_from_10(in, gid_x*2+2, gid_y*2+2, black_level);
const half4 va = vload4(0, cached + mad24(lid_y * 2 + 0, localRowLen, lid_x * 2));
const half4 vb = vload4(0, cached + mad24(lid_y * 2 + 1, localRowLen, lid_x * 2));
const half4 vc = vload4(0, cached + mad24(lid_y * 2 + 2, localRowLen, lid_x * 2));
const half4 vd = vload4(0, cached + mad24(lid_y * 2 + 3, localRowLen, lid_x * 2));
// a simplified version of https://opensignalprocessingjournal.com/contents/volumes/V6/TOSIGPJ-6-1/TOSIGPJ-6-1.pdf
const half k01 = get_k(va.s0, vb.s1, va.s2, vb.s1);

Loading…
Cancel
Save