// openpilot_comma/selfdrive/modeld/test/dmon_lag/repro.cc

// clang++ -mcpu=cortex-a57 -O2 repro.cc

#include <sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <vector>

// Reads CLOCK_BOOTTIME and returns it expressed in milliseconds
// (seconds * 1000 plus nanoseconds * 1e-6).
// The return value of clock_gettime() is not checked.
static inline double millis_since_boot() {
  struct timespec now = {};
  clock_gettime(CLOCK_BOOTTIME, &now);
  // Kept as a single expression so the floating-point evaluation is unchanged.
  return now.tv_sec * 1000.0 + now.tv_nsec * 1e-6;
}

// Requests the SCHED_FIFO policy at priority `level` for the id returned by
// getpid().
//
// Returns whatever sched_setscheduler() returns (per its man page: 0 on
// success, -1 on failure with errno set). Nothing is reported here; the
// caller decides what to do with a failure.
int set_realtime_priority(int level) {
  // pid_t is the type sched_setscheduler() takes; avoids narrowing from long.
  const pid_t tid = getpid();

  // should match python using chrt
  struct sched_param sa = {};  // value-initialized: every field starts at zero
  sa.sched_priority = level;
  return sched_setscheduler(tid, SCHED_FIFO, &sa);
}

// Width and height of the 8-bit source frame that trial() allocates and
// inner() reads.
#define MODEL_WIDTH 320
#define MODEL_HEIGHT 640
// Normalizes one sample: (x - 128) * (1/128). 0.0078125f is exactly 1/128.
// Fully parenthesized so that both the argument and the result keep their
// meaning inside larger expressions (e.g. `a / input_lambda(b)`).
#define input_lambda(x) (((x) - 128.f) * 0.0078125f)

// Ensures `buf` holds at least `size` elements and returns a pointer to its
// storage. The vector is only ever grown: when it is already large enough it
// is left untouched.
template <class T>
static inline T *get_buffer(std::vector<T> &buf, const size_t size) {
  const bool needs_growth = size > buf.size();
  if (needs_growth) {
    buf.resize(size);
  }
  T *const storage = buf.data();
  return storage;
}

// Converts one 8-bit frame in `resized_buf` into six normalized float planes
// in `net_input_buf` (yuvframe2tensor + normalize, done in one pass).
//
// Source layout as indexed below: MODEL_WIDTH*MODEL_HEIGHT luma bytes, then a
// (MODEL_WIDTH/2)*(MODEL_HEIGHT/2) U plane, then a V plane of the same size.
// Destination: six planes of (MODEL_WIDTH/2)*(MODEL_HEIGHT/2) floats each.
// Within a plane the element for (r, c) is stored at c*(MODEL_HEIGHT/2) + r.
// Every sample goes through input_lambda().
//
// The order of loads and stores matches the original loop statement for
// statement, since this file exists to time exactly this access pattern.
void inner(uint8_t *resized_buf, float *net_input_buf) {
  const int src_w = MODEL_WIDTH;
  const int src_h = MODEL_HEIGHT;
  const int half_w = MODEL_WIDTH / 2;
  const int half_h = MODEL_HEIGHT / 2;
  const int plane = half_w * half_h;  // floats per output plane

  // Chroma planes follow the full-resolution luma plane.
  const uint8_t *const u_src = resized_buf + src_w * src_h;
  const uint8_t *const v_src = u_src + (src_w / 2) * (src_h / 2);

  for (int r = 0; r < half_h; r++) {
    const uint8_t *const y_row = resized_buf + 2 * r * src_w;
    const int chroma_row = r * src_w / 2;

    for (int c = 0; c < half_w; c++) {
      float *const dst = net_input_buf + c * half_h + r;

      // Y_ul -> plane 0
      dst[0] = input_lambda(y_row[2 * c]);
      // Y_ur -> plane 2
      dst[2 * plane] = input_lambda(y_row[2 * c + 1]);
      // Y_dl -> plane 1
      // NOTE(review): the +1 is added to the element offset, not to the row,
      // so this reads the same byte as Y_ur. Preserved as-is; confirm whether
      // row 2*r+1 was intended.
      dst[plane] = input_lambda(y_row[1 + 2 * c]);
      // Y_dr -> plane 3 (same +1 element offset as Y_dl, see note above)
      dst[3 * plane] = input_lambda(y_row[1 + 2 * c + 1]);
      // U -> plane 4
      dst[4 * plane] = input_lambda(u_src[chroma_row + c]);
      // V -> plane 5
      dst[5 * plane] = input_lambda(v_src[chroma_row + c]);
    }
  }
}

// Runs __builtin___clear_cache() over both buffers, then times a single
// inner() call and returns its duration in milliseconds.
//
// `resized_bytes` and `net_input_bytes` are the buffer lengths in BYTES,
// because __builtin___clear_cache() takes a [begin, end) byte range.
static double timed_inner_ms(uint8_t *resized_buf, size_t resized_bytes,
                             float *net_input_buf, size_t net_input_bytes) {
  char *const src_begin = reinterpret_cast<char *>(resized_buf);
  char *const dst_begin = reinterpret_cast<char *>(net_input_buf);
  __builtin___clear_cache(src_begin, src_begin + resized_bytes);
  __builtin___clear_cache(dst_begin, dst_begin + net_input_bytes);

  const double start_ms = millis_since_boot();
  inner(resized_buf, net_input_buf);
  const double end_ms = millis_since_boot();
  return end_ms - start_ms;
}

// Allocates fresh source/destination buffers, times inner() 20 times and
// returns the mean duration in milliseconds.
//
// If the mean exceeds 5 ms the run counts as a hit: it prints "HIT <mean>"
// and "BAD", prints 200 further individual timings on one line, and then
// terminates the process with exit(0) instead of returning.
float trial() {
  const int kSamples = 20;       // timings averaged per trial
  const int kHitSamples = 200;   // extra timings dumped after a hit
  const int kSlowAvgMs = 5;      // mean above this counts as a hit

  // Buffers are local on purpose: every trial gets a new allocation.
  std::vector<uint8_t> vec_resized_buf;
  std::vector<float> vec_net_input_buf;

  const size_t resized_len = MODEL_WIDTH * MODEL_HEIGHT * 3 / 2;
  uint8_t *resized_buf = get_buffer(vec_resized_buf, resized_len);
  const size_t resized_bytes = resized_len * sizeof(uint8_t);

  const size_t yuv_buf_len = (MODEL_WIDTH/2) * (MODEL_HEIGHT/2) * 6; // Y|u|v -> y|y|y|y|u|v
  float *net_input_buf = get_buffer(vec_net_input_buf, yuv_buf_len);
  // The flush range is in bytes, so the float count must be scaled by
  // sizeof(float); using the element count alone covered only a quarter of
  // the destination buffer.
  const size_t yuv_buf_bytes = yuv_buf_len * sizeof(float);

  float avg = 0.0;
  for (int i = 0; i < kSamples; i++) {
    avg += timed_inner_ms(resized_buf, resized_bytes, net_input_buf, yuv_buf_bytes);
  }

  avg /= kSamples;
  if (avg > kSlowAvgMs) {
    printf("HIT %f\n", avg);
    printf("BAD\n");

    for (int i = 0; i < kHitSamples; i++) {
      const double elapsed_ms =
          timed_inner_ms(resized_buf, resized_bytes, net_input_buf, yuv_buf_bytes);
      printf("%.2f   ", elapsed_ms);
    }
    printf("\n");

    exit(0);
  }
  return avg;
}

// Entry point: requests realtime priority 51, then runs trial() in an endless
// loop, printing each returned average. The loop never ends on its own; the
// process terminates from inside trial() (via exit(0)) once a slow run is hit.
int main() {
  // the realtime priority seems to be what breaks it
  if (set_realtime_priority(51) != 0) {
    // Without realtime priority the runs below may not reproduce anything,
    // so make the failure visible. Keep going rather than abort: the
    // measurements are still valid as a non-realtime baseline.
    perror("set_realtime_priority(51)");
  }

  while (1) {
    float ret = trial();
    printf("got %f\n", ret);
  }
}
reproduce dmon lag 5 years ago			`// clang++ -mcpu=cortex-a57 -O2 repro.cc`

			`#include <vector>`
			`#include <time.h>`
			`#include <sched.h>`

			`static inline double millis_since_boot() {`
			`struct timespec t;`
			`clock_gettime(CLOCK_BOOTTIME, &t);`
			`return t.tv_sec * 1000.0 + t.tv_nsec * 1e-6;`
			`}`

			`int set_realtime_priority(int level) {`
			`long tid = getpid();`

			`// should match python using chrt`
			`struct sched_param sa;`
			`memset(&sa, 0, sizeof(sa));`
			`sa.sched_priority = level;`
			`return sched_setscheduler(tid, SCHED_FIFO, &sa);`
			`}`

			`#define MODEL_WIDTH 320`
			`#define MODEL_HEIGHT 640`
			`#define input_lambda(x) (x - 128.f) * 0.0078125f`

			`template <class T>`
			`static inline T *get_buffer(std::vector<T> &buf, const size_t size) {`
			`if (buf.size() < size) {`
			`buf.resize(size);`
			`}`
			`return buf.data();`
			`}`

			`void inner(uint8_t *resized_buf, float *net_input_buf) {`
			`int resized_width = MODEL_WIDTH;`
			`int resized_height = MODEL_HEIGHT;`

			`// one shot conversion, O(n) anyway`
			`// yuvframe2tensor, normalize`
			`for (int r = 0; r < MODEL_HEIGHT/2; r++) {`
			`for (int c = 0; c < MODEL_WIDTH/2; c++) {`
			`// Y_ul`
			`net_input_buf[(c*MODEL_HEIGHT/2) + r] = input_lambda(resized_buf[(2*r*resized_width) + (2*c)]);`
			`// Y_ur`
			`net_input_buf[(c*MODEL_HEIGHT/2) + r + (2*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width) + (2*c+1)]);`
			`// Y_dl`
			`net_input_buf[(c*MODEL_HEIGHT/2) + r + ((MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width+1) + (2*c)]);`
			`// Y_dr`
			`net_input_buf[(c*MODEL_HEIGHT/2) + r + (3*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(2*r*resized_width+1) + (2*c+1)]);`
			`// U`
			`net_input_buf[(c*MODEL_HEIGHT/2) + r + (4*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(resized_width*resized_height) + (r*resized_width/2) + c]);`
			`// V`
			`net_input_buf[(c*MODEL_HEIGHT/2) + r + (5*(MODEL_WIDTH/2)*(MODEL_HEIGHT/2))] = input_lambda(resized_buf[(resized_width*resized_height) + ((resized_width/2)*(resized_height/2)) + (r*resized_width/2) + c]);`
			`}`
			`}`
			`}`

			`float trial() {`
			`std::vector<uint8_t> vec_resized_buf;`
			`std::vector<float> vec_net_input_buf;`

			`int resized_width = MODEL_WIDTH;`
			`int resized_height = MODEL_HEIGHT;`
			`uint8_t *resized_buf = get_buffer(vec_resized_buf, resized_width*resized_height*3/2);`

			`int yuv_buf_len = (MODEL_WIDTH/2) * (MODEL_HEIGHT/2) * 6; // Y\|u\|v -> y\|y\|y\|y\|u\|v`
			`float *net_input_buf = get_buffer(vec_net_input_buf, yuv_buf_len);`

			`float avg = 0.0;`
			`for (int i = 0; i < 20; i++) {`
repros better with cache clears 5 years ago			`__builtin___clear_cache((char*)resized_buf, (char*)resized_buf + (resized_width*resized_height*3/2));`
			`__builtin___clear_cache((char*)net_input_buf, (char*)net_input_buf + yuv_buf_len);`
reproduce dmon lag 5 years ago
			`double s4 = millis_since_boot();`
			`inner(resized_buf, net_input_buf);`
			`double s5 = millis_since_boot();`
			`avg += s5-s4;`
			`}`

			`avg /= 20;`
			`if (avg > 5) {`
			`printf("HIT %f\n", avg);`
			`printf("BAD\n");`

			`for (int i = 0; i < 200; i++) {`
repros better with cache clears 5 years ago			`__builtin___clear_cache((char*)resized_buf, (char*)resized_buf + (resized_width*resized_height*3/2));`
			`__builtin___clear_cache((char*)net_input_buf, (char*)net_input_buf + yuv_buf_len);`
reproduce dmon lag 5 years ago
			`double s4 = millis_since_boot();`
			`inner(resized_buf, net_input_buf);`
			`double s5 = millis_since_boot();`
			`printf("%.2f   ", s5-s4);`
			`}`
			`printf("\n");`

			`exit(0);`
			`}`
			`return avg;`
			`}`

			`int main() {`
repros better with cache clears 5 years ago			`// the realtime priority seems to be what breaks it`
reproduce dmon lag 5 years ago			`set_realtime_priority(51);`

			`while (1) {`
			`float ret = trial();`
			`printf("got %f\n", ret);`
			`}`
			`}`