#include <map>
#include <string>
#include <string.h>
#include <assert.h>
#include "thneed.h"
#include "common/util.h"
#include "common/clutil.h"

extern map<cl_program, string> g_program_source;
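// Maps every cl_program Thneed has built back to its source text; replacement
// programs built below are added to it too, presumably so later serialization
// can still find their source. The map itself is defined elsewhere in thneed.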
/*static int is_same_size_image(cl_mem a, cl_mem b) {
  size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch;
  clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL);
  clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL);
  clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL);
  clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL);
  clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL);
  clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL);

  size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch;
  clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL);
  clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL);
  clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL);
  clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL);
  clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL);
  clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL);

  return (a_width == b_width) && (a_height == b_height) &&
         (a_depth == b_depth) && (a_array_size == b_array_size) &&
         (a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch);
}*/
static cl_mem make_image_like(cl_context context, cl_mem val) {
  cl_image_format format;
  size_t width, height, row_pitch;
  clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
  assert(format.image_channel_order == CL_RGBA);
  assert(format.image_channel_data_type == CL_HALF_FLOAT);
  clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
  clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);

  cl_image_desc desc = {0};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  desc.image_row_pitch = row_pitch;
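  // Back the new image with a plain buffer so the row pitch can be copied from
  // the original. This relies on the image2d-from-buffer path; the exact
  // extension involved (e.g. cl_khr_image2d_from_buffer or a vendor
  // equivalent) is an assumption, since this file doesn't spell it out.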
  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL);
  assert(buf != NULL);
  desc.buffer = buf;

  cl_int err;
  cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
  //printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch);
  assert(tmp != NULL);
  return tmp;
}
// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
// make that faster and the model gets faster
// this cuts ~2 ms off the model runtime right now
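// optimize() rewrites the recorded kernel queue in three passes: swap in
// custom kernel source from KERNEL_PATH where a matching .cl file exists,
// fuse activate_image and elementwise_sum layers into the preceding
// conv/fc_Wtx kernel, then greedily tune the convolutions' local work sizes.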
int Thneed::optimize() {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); }

  string convolution_;
  {
    char fn[0x100];
    snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, "convolution_");
    convolution_ = util::read_file(fn);
  }
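  // convolution_.cl holds code shared by all convolution_* replacement
  // kernels; it is appended to each convolution kernel's source below.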
  // load custom kernels
  map<string, cl_program> g_programs;
  for (auto &k : kq) {
    // replace program?
    if (g_programs.find(k->name) == g_programs.end()) {
      char fn[0x100];
      snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str());

      if (util::file_exists(fn)) {
        string kernel_src = util::read_file(fn);
        if (k->name.rfind("convolution_", 0) == 0) {
          kernel_src += convolution_;
        }
        printf("building kernel %s with len %lu\n", k->name.c_str(), kernel_src.length());
        k->program = cl_program_from_source(context, device_id, kernel_src);

        // save in cache
        g_programs[k->name] = k->program;
        g_program_source[k->program] = kernel_src;
      } else {
        g_programs[k->name] = NULL;
      }
    } else {
      // cached replacement
      if (g_programs[k->name] != NULL) {
        k->program = g_programs[k->name];
      }
    }

    // hack in accumulator to convolution_horizontal_reduced_reads_1x1
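    // The replacement kernel takes two extra arguments the stock one doesn't
    // have: a 2-byte doAccumulate flag and an 8-byte cl_mem accumulator,
    // initially aliasing the output. The elementwise_sum fusion below fills
    // them in when it folds an accumulation into this conv.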
    if (k->name == "convolution_horizontal_reduced_reads_1x1") {
      k->arg_names.push_back("doAccumulate");
      short doAccumulate = 0;
      k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate)));
      k->args_size.push_back(2);

      k->arg_names.push_back("accumulator");
      k->args.push_back(k->args[k->get_arg_num("output")]);
      k->args_size.push_back(8);

      k->num_args += 2;
    }

    // assert that parameters + batchNormBiases are not used
    // since they aren't supported in custom replacement kernels
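    // Each arg blob stores the raw cl_mem handle, so an 8-byte all-zero blob
    // means a NULL cl_mem was set, i.e. the layer doesn't use that argument.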
    if (k->name == "convolution_horizontal_reduced_reads_1x1" ||
        k->name == "convolution_horizontal_reduced_reads" ||
        k->name == "convolution_horizontal_reduced_reads_5_outputs") {
      string p1 = k->args[k->get_arg_num("parameters")];
      string p2 = k->args[k->get_arg_num("batchNormBiases")];
      assert(p1.length() == 8 && *((uint64_t *)p1.data()) == 0);
      assert(p2.length() == 8 && *((uint64_t *)p2.data()) == 0);
    }
  }

  // optimizer
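  // Repeat the fusion passes until a full pass removes no kernels.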
  size_t start_size;
  do {
    start_size = kq.size();

    // get optimizations
    map<string, string> replacements;
    for (int i = 0; i < kq.size(); i++) {
      // fusing elementwise_sum + activate_image will save 3 enqueues

      // delete useless copy layers
      // saves ~0.7 ms
      /*if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") {
        string in = kq[i]->args[kq[i]->get_arg_num("input")];
        string out = kq[i]->args[kq[i]->get_arg_num("output")];
        if (is_same_size_image(*(cl_mem *)in.data(), *(cl_mem *)out.data())) {
          cl_mem tmp = make_image_like(context, *(cl_mem *)in.data());
          replacements[in] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          kq.erase(kq.begin()+i); --i;
        }
      }*/

      // NOTE: if activations/accumulation are done in the wrong order, this will be wrong
      // fuse activations into convs and fc_Wtx
      // saves ~1.5 ms
      // NOTE: this changes the outputs because of rounding, should be better now!
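      // The fusion writes the activation's neuron id into the preceding
      // kernel's "neuron" argument, drops the activate_image enqueue, and
      // redirects both the old intermediate and the activation's output to a
      // fresh image so downstream kernels read the fused result directly.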
      if (i != 0 && kq[i]->name == "activate_image") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string in = kq[i]->args[kq[i]->get_arg_num("input")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == in) {
            short neuron = *(int *)kq[i]->args[kq[i]->get_arg_num("neuron")].data();
            assert(neuron <= 5);

            // ELU isn't supported in fc_Wtx
            assert(!(kq[i-1]->name == "fc_Wtx" && neuron == 5));

            kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron));

            cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
            replacements[in] = string((char *)&tmp, sizeof(tmp));
            replacements[out] = string((char *)&tmp, sizeof(tmp));

            kq.erase(kq.begin()+i); --i;
          }
        }
      }

      // fuse accumulation into convs and fc_Wtx
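      // An elementwise_sum whose "a" or "b" operand is the previous kernel's
      // output gets folded into that kernel: the other operand becomes the
      // accumulator argument and doAccumulate (added above) is switched on.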
      if (i != 0 && kq[i]->name == "elementwise_sum") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string a = kq[i]->args[kq[i]->get_arg_num("a")];
          string b = kq[i]->args[kq[i]->get_arg_num("b")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == a) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b;
          } else if (lastout == b) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a;
          } else {
            continue;
          }

          cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
          replacements[lastout] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          short doAccumulate = 1;
          kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate));

          kq.erase(kq.begin()+i); --i;
        }
      }
    }

    // remap inputs and outputs, and clear the kernels
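    // Clearing kq[i]->kernel forces the cl_kernel to be rebuilt from the
    // (possibly replaced) program with the remapped args on the next run;
    // this assumes the exec path recreates kernels that are NULL.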
    for (int i = 0; i < kq.size(); i++) {
      kq[i]->kernel = NULL;
      for (int j = 0; j < kq[i]->num_args; j++) {
        if (replacements.find(kq[i]->args[j]) != replacements.end()) {
          kq[i]->args[j] = replacements[kq[i]->args[j]];
        }
      }
    }

    printf("optimize %lu -> %lu\n", start_size, kq.size());
  } while (kq.size() != start_size);

  size_t work_group_size = 0;
  clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL);
  printf("max work group size %lu\n", work_group_size);

  // local work group optimizer
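  // Greedy search: when the current local size is under half the device max,
  // try doubling each local dimension in turn and keep the single doubling
  // that benchmarks fastest.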
  for (auto &k : kq) {
    // only do it for convs, since others might share memory
    if (k->name.rfind("convolution_", 0) == 0) {
      int best = -1;
      if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) {
        uint64_t base_time = k->benchmark();
        uint64_t best_time = base_time;
        for (int i = 0; i < 3; i++) {
          k->local_work_size[i] *= 2;
          uint64_t this_time = k->benchmark();
          if (this_time < best_time) {
            best = i;
            best_time = this_time;
          }
          k->local_work_size[i] /= 2;
        }
        if (best != -1) {
          k->local_work_size[best] *= 2;
          //printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
        }
      }
    }
  }

  return 0;
}