From aada0e9a66bfb8fe6c05bfbf3b058baef3bd3881 Mon Sep 17 00:00:00 2001
From: ZwX1616 <zwx1616@gmail.com>
Date: Sat, 2 May 2020 21:40:43 -0700
Subject: [PATCH] bad AF state logging (#1365)

old-commit-hash: 968e2585cc7e2a580fa351a75ceb02c1fe02c409
---
 release/files_common                    |   5 +
 selfdrive/camerad/SConscript            |   1 +
 selfdrive/camerad/cameras/camera_qcom.c |   2 +
 selfdrive/camerad/imgproc/conv.cl       | 110 +++++++++++++++++++
 selfdrive/camerad/imgproc/pool.cl       |  34 ++++++
 selfdrive/camerad/imgproc/utils.cc      |  44 ++++++++
 selfdrive/camerad/imgproc/utils.h       |  30 ++++++
 selfdrive/camerad/main.cc               | 136 +++++++++++++++++++++++-
 8 files changed, 358 insertions(+), 4 deletions(-)
 create mode 100644 selfdrive/camerad/imgproc/conv.cl
 create mode 100644 selfdrive/camerad/imgproc/pool.cl
 create mode 100644 selfdrive/camerad/imgproc/utils.cc
 create mode 100644 selfdrive/camerad/imgproc/utils.h

diff --git a/release/files_common b/release/files_common
index 4a666b6b0b..b2791c4d46 100644
--- a/release/files_common
+++ b/release/files_common
@@ -355,6 +355,11 @@ selfdrive/camerad/transforms/rgb_to_yuv.h
 selfdrive/camerad/transforms/rgb_to_yuv.cl
 selfdrive/camerad/transforms/rgb_to_yuv_test.cc
 
+selfdrive/camerad/imgproc/conv.cl
+selfdrive/camerad/imgproc/pool.cl
+selfdrive/camerad/imgproc/utils.cc
+selfdrive/camerad/imgproc/utils.h
+
 selfdrive/modeld/SConscript
 selfdrive/modeld/modeld.cc
 selfdrive/modeld/dmonitoringmodeld.cc
diff --git a/selfdrive/camerad/SConscript b/selfdrive/camerad/SConscript
index 374ae139c5..405be689f1 100644
--- a/selfdrive/camerad/SConscript
+++ b/selfdrive/camerad/SConscript
@@ -26,5 +26,6 @@ env.SharedLibrary('snapshot/visionipc',
 env.Program('camerad', [
     'main.cc',
     'transforms/rgb_to_yuv.c',
+    'imgproc/utils.cc',
     cameras,
   ], LIBS=libs)
diff --git a/selfdrive/camerad/cameras/camera_qcom.c b/selfdrive/camerad/cameras/camera_qcom.c
index 1d1396d7bf..b7b951f9ca 100644
--- a/selfdrive/camerad/cameras/camera_qcom.c
+++ b/selfdrive/camerad/cameras/camera_qcom.c
@@ -1719,6 +1719,8 @@ static void parse_autofocus(CameraState *s, uint8_t *d) {
   for (int i = 0; i < NUM_FOCUS; i++) {
     int doff = i*5+5;
     s->confidence[i] = d[doff];
+    // this should just be a 10-bit signed int instead of 11
+    // TODO: write it in a nicer way
     int16_t focus_t = (d[doff+1] << 3) | (d[doff+2] >> 5);
     if (focus_t >= 1024) focus_t = -(2048-focus_t);
     s->focus[i] = focus_t;
diff --git a/selfdrive/camerad/imgproc/conv.cl b/selfdrive/camerad/imgproc/conv.cl
new file mode 100644
index 0000000000..f92d356705
--- /dev/null
+++ b/selfdrive/camerad/imgproc/conv.cl
@@ -0,0 +1,110 @@
+// const __constant float3 rgb_weights = (0.299, 0.587, 0.114); // opencv rgb2gray weights
+// const __constant float3 bgr_weights = (0.114, 0.587, 0.299); // bgr2gray weights
+
+// convert input rgb image to single channel then conv
+__kernel void rgb2gray_conv2d(
+  const __global uchar * input, 
+  __global short * output,
+  __constant short * filter,
+  __local uchar3 * cached
+)
+{
+  const int rowOffset = get_global_id(1) * IMAGE_W;
+  const int my = get_global_id(0) + rowOffset;
+
+  const int localRowLen = TWICE_HALF_FILTER_SIZE + get_local_size(0);
+  const int localRowOffset = ( get_local_id(1) + HALF_FILTER_SIZE ) * localRowLen;
+  const int myLocal = localRowOffset + get_local_id(0) + HALF_FILTER_SIZE;
+
+  // cache local pixels
+  cached[ myLocal ].x = input[ my * 3 ]; // r
+  cached[ myLocal ].y = input[ my * 3 + 1]; // g
+  cached[ myLocal ].z = input[ my * 3 + 2]; // b
+
+  // pad
+  if (
+    get_global_id(0) < HALF_FILTER_SIZE       || 
+    get_global_id(0) > IMAGE_W - HALF_FILTER_SIZE - 1   || 
+    get_global_id(1) < HALF_FILTER_SIZE     ||
+    get_global_id(1) > IMAGE_H - HALF_FILTER_SIZE - 1
+  )
+  {
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return;
+  }
+  else 
+  {
+    int localColOffset = -1;
+    int globalColOffset = -1;
+    
+    // cache extra
+    if ( get_local_id(0) < HALF_FILTER_SIZE )
+    {
+      localColOffset = get_local_id(0);
+      globalColOffset = -HALF_FILTER_SIZE;
+
+      cached[ localRowOffset + get_local_id(0) ].x = input[ my * 3 - HALF_FILTER_SIZE * 3 ];
+      cached[ localRowOffset + get_local_id(0) ].y = input[ my * 3 - HALF_FILTER_SIZE * 3 + 1];
+      cached[ localRowOffset + get_local_id(0) ].z = input[ my * 3 - HALF_FILTER_SIZE * 3 + 2];
+    }
+    else if ( get_local_id(0) >= get_local_size(0) - HALF_FILTER_SIZE )
+    {
+      localColOffset = get_local_id(0) + TWICE_HALF_FILTER_SIZE;
+      globalColOffset = HALF_FILTER_SIZE;
+      
+      cached[ myLocal + HALF_FILTER_SIZE ].x = input[ my * 3 + HALF_FILTER_SIZE * 3 ];
+      cached[ myLocal + HALF_FILTER_SIZE ].y = input[ my * 3 + HALF_FILTER_SIZE * 3 + 1];
+      cached[ myLocal + HALF_FILTER_SIZE ].z = input[ my * 3 + HALF_FILTER_SIZE * 3 + 2];
+    }
+
+
+    if ( get_local_id(1) < HALF_FILTER_SIZE )
+    {
+      cached[ get_local_id(1) * localRowLen + get_local_id(0) + HALF_FILTER_SIZE ].x = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 ];
+      cached[ get_local_id(1) * localRowLen + get_local_id(0) + HALF_FILTER_SIZE ].y = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + 1];
+      cached[ get_local_id(1) * localRowLen + get_local_id(0) + HALF_FILTER_SIZE ].z = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + 2];
+      if (localColOffset > 0)
+      {
+        cached[ get_local_id(1) * localRowLen + localColOffset ].x = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3];
+        cached[ get_local_id(1) * localRowLen + localColOffset ].y = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 1];
+        cached[ get_local_id(1) * localRowLen + localColOffset ].z = input[ my * 3 - HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 2];
+      }
+    }
+    else if ( get_local_id(1) >= get_local_size(1) -HALF_FILTER_SIZE )
+    {
+      int offset = ( get_local_id(1) + TWICE_HALF_FILTER_SIZE ) * localRowLen;
+      cached[ offset + get_local_id(0) + HALF_FILTER_SIZE ].x = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 ];
+      cached[ offset + get_local_id(0) + HALF_FILTER_SIZE ].y = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + 1];
+      cached[ offset + get_local_id(0) + HALF_FILTER_SIZE ].z = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + 2];
+      if (localColOffset > 0)
+      {
+        cached[ offset + localColOffset ].x = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3];
+        cached[ offset + localColOffset ].y = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 1];
+        cached[ offset + localColOffset ].z = input[ my * 3 + HALF_FILTER_SIZE_IMAGE_W * 3 + globalColOffset * 3 + 2];
+      }
+    }
+
+    // sync
+    barrier(CLK_LOCAL_MEM_FENCE);
+
+    // perform convolution
+    int fIndex = 0;
+    short sum = 0;
+
+    for (int r = -HALF_FILTER_SIZE; r <= HALF_FILTER_SIZE; r++)
+    {
+      int curRow = r * localRowLen;
+      for (int c = -HALF_FILTER_SIZE; c <= HALF_FILTER_SIZE; c++, fIndex++)
+      {
+        if (!FLIP_RB){
+          // sum += dot(rgb_weights, cached[ myLocal + curRow + c ]) * filter[ fIndex ];
+          sum += (cached[ myLocal + curRow + c ].x / 3 + cached[ myLocal + curRow + c ].y / 2 + cached[ myLocal + curRow + c ].z / 9) * filter[ fIndex ];
+        } else {
+          // sum += dot(bgr_weights, cached[ myLocal + curRow + c ]) * filter[ fIndex ];
+          sum += (cached[ myLocal + curRow + c ].x / 9 + cached[ myLocal + curRow + c ].y / 2 + cached[ myLocal + curRow + c ].z / 3) * filter[ fIndex ];
+        }
+      }
+    }
+    output[my] = sum;
+  }
+}
\ No newline at end of file
diff --git a/selfdrive/camerad/imgproc/pool.cl b/selfdrive/camerad/imgproc/pool.cl
new file mode 100644
index 0000000000..3ba86ae24e
--- /dev/null
+++ b/selfdrive/camerad/imgproc/pool.cl
@@ -0,0 +1,34 @@
+// calculate variance in each subregion
+__kernel void var_pool(
+  const __global char * input, 
+  __global ushort * output // should not be larger than 128*128 so uint16
+)
+{
+  const int xidx = get_global_id(0) + ROI_X_MIN;
+  const int yidx = get_global_id(1) + ROI_Y_MIN;
+
+  const int size = X_PITCH * Y_PITCH;
+
+  float fsum = 0;
+  char mean, max;
+  
+  for (int i = 0; i < size; i++) {
+    int x_offset = i % X_PITCH;
+    int y_offset = i / X_PITCH;
+    fsum += input[xidx*X_PITCH + yidx*Y_PITCH*FULL_STRIDE_X + x_offset + y_offset*FULL_STRIDE_X];
+    max = input[xidx*X_PITCH + yidx*Y_PITCH*FULL_STRIDE_X + x_offset + y_offset*FULL_STRIDE_X]>max ? input[xidx*X_PITCH + yidx*Y_PITCH*FULL_STRIDE_X + x_offset + y_offset*FULL_STRIDE_X]:max;
+  }
+
+  mean = convert_char_rte(fsum / size);
+
+  float fvar = 0;
+  for (int i = 0; i < size; i++) {
+    int x_offset = i % X_PITCH;
+    int y_offset = i / X_PITCH;
+    fvar += (input[xidx*X_PITCH + yidx*Y_PITCH*FULL_STRIDE_X + x_offset + y_offset*FULL_STRIDE_X] - mean) * (input[xidx*X_PITCH + yidx*Y_PITCH*FULL_STRIDE_X + x_offset + y_offset*FULL_STRIDE_X] - mean);
+  }
+
+  fvar = fvar / size;
+
+  output[(xidx-ROI_X_MIN)+(yidx-ROI_Y_MIN)*(ROI_X_MAX-ROI_X_MIN+1)] = convert_ushort_rte(5 * fvar + convert_float_rte(max));
+}
\ No newline at end of file
diff --git a/selfdrive/camerad/imgproc/utils.cc b/selfdrive/camerad/imgproc/utils.cc
new file mode 100644
index 0000000000..0b36d68637
--- /dev/null
+++ b/selfdrive/camerad/imgproc/utils.cc
@@ -0,0 +1,44 @@
+#include "utils.h"
+#include <stdio.h>
+#include <algorithm>
+
+// calculate score based on laplacians in one area
+void get_lapmap_one(int16_t *lap, uint16_t *res, int x_pitch, int y_pitch) {
+  int size = x_pitch * y_pitch;
+  // avg and max of roi
+  float fsum = 0;
+  int16_t mean, max;
+  max = 0;
+
+  for (int i = 0; i < size; i++) {
+    int x_offset = i % x_pitch;
+    int y_offset = i / x_pitch;
+    fsum += lap[x_offset + y_offset*x_pitch];
+    max = std::max(lap[x_offset + y_offset*x_pitch], max);
+  }
+
+  mean = fsum / size;
+
+  // var of roi
+  float fvar = 0;
+  for (int i = 0; i < size; i++) {
+    int x_offset = i % x_pitch;
+    int y_offset = i / x_pitch;
+    fvar += (float)((lap[x_offset + y_offset*x_pitch] - mean) * (lap[x_offset + y_offset*x_pitch] - mean));
+  }
+
+  fvar = fvar / size;
+
+  *res = std::min(5 * fvar + max, (float)65535);
+}
+
+bool is_blur(uint16_t *lapmap) {
+  int n_roi = (ROI_X_MAX - ROI_X_MIN + 1) * (ROI_Y_MAX - ROI_Y_MIN + 1);
+  float bad_sum = 0;
+  for (int i = 0; i < n_roi; i++) {
+    if (*(lapmap + i) < LM_THRESH) {
+      bad_sum += 1/(float)n_roi;
+    }
+  }
+  return (bad_sum > LM_PREC_THRESH);
+}
\ No newline at end of file
diff --git a/selfdrive/camerad/imgproc/utils.h b/selfdrive/camerad/imgproc/utils.h
new file mode 100644
index 0000000000..f77a456912
--- /dev/null
+++ b/selfdrive/camerad/imgproc/utils.h
@@ -0,0 +1,30 @@
+#ifndef IMGPROC_UTILS
+#define IMGPROC_UTILS
+
+#include <stdint.h>
+
+#define NUM_SEGMENTS_X 8
+#define NUM_SEGMENTS_Y 6
+
+#define ROI_X_MIN 1
+#define ROI_X_MAX 6
+#define ROI_Y_MIN 2
+#define ROI_Y_MAX 3
+
+#define LM_THRESH 222
+#define LM_PREC_THRESH 0.9
+
+// only apply to QCOM
+#define FULL_STRIDE_X 1280
+#define FULL_STRIDE_Y 896
+
+#define CONV_LOCAL_WORKSIZE 16
+
+const int16_t lapl_conv_krnl[9] = {0, 1, 0,
+                                  1, -4, 1,
+                                  0, 1, 0};
+
+void get_lapmap_one(int16_t *lap, uint16_t *res, int x_pitch, int y_pitch);
+bool is_blur(uint16_t *lapmap);
+
+#endif
\ No newline at end of file
diff --git a/selfdrive/camerad/main.cc b/selfdrive/camerad/main.cc
index ccbd9874b8..95b57de19e 100644
--- a/selfdrive/camerad/main.cc
+++ b/selfdrive/camerad/main.cc
@@ -23,6 +23,7 @@
 #include "messaging.hpp"
 
 #include "transforms/rgb_to_yuv.h"
+#include "imgproc/utils.h"
 
 #include "clutil.h"
 #include "bufs.h"
@@ -78,6 +79,14 @@ struct VisionState {
   cl_kernel krnl_debayer_rear;
   cl_kernel krnl_debayer_front;
 
+  cl_program prg_rgb_laplacian;
+  cl_kernel krnl_rgb_laplacian;
+
+  int conv_cl_localMemSize;
+  size_t conv_cl_globalWorkSize[2];
+  size_t conv_cl_localWorkSize[2];
+  size_t pool_cl_globalWorkSize[2];
+
   // processing
   TBuffer ui_tb;
   TBuffer ui_front_tb;
@@ -110,6 +119,8 @@ struct VisionState {
   int rgb_width, rgb_height, rgb_stride;
   VisionBuf rgb_bufs[UI_BUF_COUNT];
   cl_mem rgb_bufs_cl[UI_BUF_COUNT];
+  cl_mem rgb_conv_roi_cl, rgb_conv_result_cl, rgb_conv_filter_cl;
+  uint16_t lapres[(ROI_X_MAX-ROI_X_MIN+1)*(ROI_Y_MAX-ROI_Y_MIN+1)];
 
   size_t rgb_front_buf_size;
   int rgb_front_width, rgb_front_height, rgb_front_stride;
@@ -177,9 +188,9 @@ void* frontview_thread(void *arg) {
     cl_event debayer_event;
     if (s->cameras.front.ci.bayer) {
       err = clSetKernelArg(s->krnl_debayer_front, 0, sizeof(cl_mem), &s->front_camera_bufs_cl[buf_idx]);
-      cl_check_error(err);
+      assert(err == 0);
       err = clSetKernelArg(s->krnl_debayer_front, 1, sizeof(cl_mem), &s->rgb_front_bufs_cl[rgb_idx]);
-      cl_check_error(err);
+      assert(err == 0);
       float digital_gain = 1.0;
       err = clSetKernelArg(s->krnl_debayer_front, 2, sizeof(float), &digital_gain);
       assert(err == 0);
@@ -390,9 +401,9 @@ void* processing_thread(void *arg) {
     cl_event debayer_event;
     if (s->cameras.rear.ci.bayer) {
       err = clSetKernelArg(s->krnl_debayer_rear, 0, sizeof(cl_mem), &s->camera_bufs_cl[buf_idx]);
-      cl_check_error(err);
+      assert(err == 0);
       err = clSetKernelArg(s->krnl_debayer_rear, 1, sizeof(cl_mem), &s->rgb_bufs_cl[rgb_idx]);
-      cl_check_error(err);
+      assert(err == 0);
       err = clSetKernelArg(s->krnl_debayer_rear, 2, sizeof(float), &s->cameras.rear.digital_gain);
       assert(err == 0);
 
@@ -416,6 +427,74 @@ void* processing_thread(void *arg) {
 
     visionbuf_sync(&s->rgb_bufs[rgb_idx], VISIONBUF_SYNC_FROM_DEVICE);
 
+#ifdef QCOM
+    /*FILE *dump_rgb_file = fopen("/tmp/process_dump.rgb", "wb");
+    fwrite(s->rgb_bufs[rgb_idx].addr, s->rgb_bufs[rgb_idx].len, sizeof(uint8_t), dump_rgb_file);
+    fclose(dump_rgb_file);
+    printf("ORIGINAL SAVED!!\n");*/
+
+    /*double t10 = millis_since_boot();*/
+
+    // cache rgb roi and write to cl
+    uint8_t *rgb_roi_buf = new uint8_t[(s->rgb_width/NUM_SEGMENTS_X)*(s->rgb_height/NUM_SEGMENTS_Y)*3];
+    int roi_id = cnt % ((ROI_X_MAX-ROI_X_MIN+1)*(ROI_Y_MAX-ROI_Y_MIN+1)); // rolling roi
+    int roi_x_offset = roi_id % (ROI_X_MAX-ROI_X_MIN+1);
+    int roi_y_offset = roi_id / (ROI_X_MAX-ROI_X_MIN+1);
+
+    for (int r=0;r<(s->rgb_height/NUM_SEGMENTS_Y);r++) {
+      memcpy(rgb_roi_buf + r * (s->rgb_width/NUM_SEGMENTS_X) * 3,
+              (uint8_t *) s->rgb_bufs[rgb_idx].addr + \
+                (ROI_Y_MIN + roi_y_offset) * s->rgb_height/NUM_SEGMENTS_Y * FULL_STRIDE_X * 3 + \
+                (ROI_X_MIN + roi_x_offset) * s->rgb_width/NUM_SEGMENTS_X * 3 + r * FULL_STRIDE_X * 3,
+              s->rgb_width/NUM_SEGMENTS_X * 3);
+    }
+
+    err = clEnqueueWriteBuffer (q, s->rgb_conv_roi_cl, true, 0,
+        s->rgb_width/NUM_SEGMENTS_X * s->rgb_height/NUM_SEGMENTS_Y * 3 * sizeof(uint8_t), rgb_roi_buf, 0, 0, 0);
+    assert(err == 0);
+
+    /*double t11 = millis_since_boot();
+    printf("cache time: %f ms\n", t11 - t10);
+    t10 = millis_since_boot();*/
+
+    err = clSetKernelArg(s->krnl_rgb_laplacian, 0, sizeof(cl_mem), (void *) &s->rgb_conv_roi_cl);
+    assert(err == 0);
+    err = clSetKernelArg(s->krnl_rgb_laplacian, 1, sizeof(cl_mem), (void *) &s->rgb_conv_result_cl);
+    assert(err == 0);
+    err = clSetKernelArg(s->krnl_rgb_laplacian, 2, sizeof(cl_mem), (void *) &s->rgb_conv_filter_cl);
+    assert(err == 0);
+    err = clSetKernelArg(s->krnl_rgb_laplacian, 3, s->conv_cl_localMemSize, 0);
+    assert(err == 0);
+
+    cl_event conv_event;
+    err = clEnqueueNDRangeKernel(q, s->krnl_rgb_laplacian, 2, NULL,
+                                   s->conv_cl_globalWorkSize, s->conv_cl_localWorkSize, 0, 0, &conv_event);
+    assert(err == 0);
+    clWaitForEvents(1, &conv_event);
+    clReleaseEvent(conv_event);
+
+    int16_t *conv_result = new int16_t[(s->rgb_width/NUM_SEGMENTS_X)*(s->rgb_height/NUM_SEGMENTS_Y)];
+    err = clEnqueueReadBuffer(q, s->rgb_conv_result_cl, true, 0,
+       s->rgb_width/NUM_SEGMENTS_X * s->rgb_height/NUM_SEGMENTS_Y * sizeof(int16_t), conv_result, 0, 0, 0);
+    assert(err == 0);
+
+    /*t11 = millis_since_boot();
+    printf("conv time: %f ms\n", t11 - t10);
+    t10 = millis_since_boot();*/
+
+    get_lapmap_one(conv_result, &s->lapres[roi_id], s->rgb_width/NUM_SEGMENTS_X, s->rgb_height/NUM_SEGMENTS_Y);
+
+    /*t11 = millis_since_boot();
+    printf("pool time: %f ms\n", t11 - t10);
+    t10 = millis_since_boot();*/
+
+    delete [] rgb_roi_buf;
+    delete [] conv_result;
+
+    /*t11 = millis_since_boot();
+    printf("process time: %f ms\n ----- \n", t11 - t10);
+    t10 = millis_since_boot();*/
+#endif
 
     double t2 = millis_since_boot();
 
@@ -465,6 +544,8 @@ void* processing_thread(void *arg) {
         kj::ArrayPtr<const uint8_t> focus_confs(&s->cameras.rear.confidence[0], NUM_FOCUS);
         framed.setFocusVal(focus_vals);
         framed.setFocusConf(focus_confs);
+        kj::ArrayPtr<const uint16_t> sharpness_score(&s->lapres[0], (ROI_X_MAX-ROI_X_MIN+1)*(ROI_Y_MAX-ROI_Y_MIN+1));
+        framed.setSharpnessScore(sharpness_score);
 #endif
 
 // TODO: add this back
@@ -907,6 +988,34 @@ cl_program build_debayer_program(VisionState *s,
   return CLU_LOAD_FROM_FILE(s->context, s->device_id, "cameras/debayer.cl", args);
 }
 
+cl_program build_conv_program(VisionState *s,
+                              int image_w, int image_h,
+                              int filter_size) {
+  char args[4096];
+  snprintf(args, sizeof(args),
+          "-cl-fast-relaxed-math -cl-denorms-are-zero "
+          "-DIMAGE_W=%d -DIMAGE_H=%d -DFLIP_RB=%d "
+          "-DFILTER_SIZE=%d -DHALF_FILTER_SIZE=%d -DTWICE_HALF_FILTER_SIZE=%d -DHALF_FILTER_SIZE_IMAGE_W=%d",
+          image_w, image_h, 1,
+          filter_size, filter_size/2, (filter_size/2)*2, (filter_size/2)*image_w);
+  return CLU_LOAD_FROM_FILE(s->context, s->device_id, "imgproc/conv.cl", args);
+}
+
+cl_program build_pool_program(VisionState *s,
+                              int full_stride_x,
+                              int x_pitch, int y_pitch,
+                              int roi_x_min, int roi_x_max,
+                              int roi_y_min, int roi_y_max) {
+  char args[4096];
+  snprintf(args, sizeof(args),
+          "-cl-fast-relaxed-math -cl-denorms-are-zero "
+          "-DFULL_STRIDE_X=%d -DX_PITCH=%d -DY_PITCH=%d "
+          "-DROI_X_MIN=%d -DROI_X_MAX=%d -DROI_Y_MIN=%d -DROI_Y_MAX=%d",
+          full_stride_x, x_pitch, y_pitch,
+          roi_x_min, roi_x_max, roi_y_min, roi_y_max);
+  return CLU_LOAD_FROM_FILE(s->context, s->device_id, "imgproc/pool.cl", args);
+}
+
 void cl_init(VisionState *s) {
   int err;
   cl_platform_id platform_id = NULL;
@@ -1049,6 +1158,25 @@ void init_buffers(VisionState *s) {
     assert(err == 0);
   }
 
+  s->prg_rgb_laplacian = build_conv_program(s, s->rgb_width/NUM_SEGMENTS_X, s->rgb_height/NUM_SEGMENTS_Y, 
+                                            3);
+  s->krnl_rgb_laplacian = clCreateKernel(s->prg_rgb_laplacian, "rgb2gray_conv2d", &err);
+  assert(err == 0);
+  s->rgb_conv_roi_cl = clCreateBuffer(s->context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+      s->rgb_width/NUM_SEGMENTS_X * s->rgb_height/NUM_SEGMENTS_Y * 3 * sizeof(uint8_t), NULL, NULL);
+  s->rgb_conv_result_cl = clCreateBuffer(s->context, CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
+      s->rgb_width/NUM_SEGMENTS_X * s->rgb_height/NUM_SEGMENTS_Y * sizeof(int16_t), NULL, NULL);
+  s->rgb_conv_filter_cl = clCreateBuffer(s->context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
+      9 * sizeof(int16_t), (void*)&lapl_conv_krnl, NULL);
+  s->conv_cl_localMemSize = ( CONV_LOCAL_WORKSIZE + 2 * (3 / 2) ) * ( CONV_LOCAL_WORKSIZE + 2 * (3 / 2) );
+  s->conv_cl_localMemSize *= 3 * sizeof(uint8_t);
+  s->conv_cl_globalWorkSize[0] = s->rgb_width/NUM_SEGMENTS_X;
+  s->conv_cl_globalWorkSize[1] = s->rgb_height/NUM_SEGMENTS_Y;
+  s->conv_cl_localWorkSize[0] = CONV_LOCAL_WORKSIZE;
+  s->conv_cl_localWorkSize[1] = CONV_LOCAL_WORKSIZE;
+
+  for (int i=0; i<(ROI_X_MAX-ROI_X_MIN+1)*(ROI_Y_MAX-ROI_Y_MIN+1); i++) {s->lapres[i] = 16160;}
+
   rgb_to_yuv_init(&s->rgb_to_yuv_state, s->context, s->device_id, s->yuv_width, s->yuv_height, s->rgb_stride);
   rgb_to_yuv_init(&s->front_rgb_to_yuv_state, s->context, s->device_id, s->yuv_front_width, s->yuv_front_height, s->rgb_front_stride);
 }