openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

314 lines
9.4 KiB

#include <stdio.h>
#include <stdlib.h>
#include <CL/cl.h>
#include <assert.h>
#include <time.h>
/*
block7b_project_conv (Conv2D) (None, 8, 16, 352) 743424 block7b_activation[0][0]
8448*8*4 = 8*16*2112 = 270336 = input = 128*2112
2112*88*4 = 743424 = weights = 2112*352
1408*8*4 = 8*16*352 = 45056 = output = 128*352
FLOPS = 128*2112*352 = 95158272 = 95 MFLOPS
RAM = 128*2112 + 2112*352 + 128*352 = 1058816 = 1 M accesses
# 22 groups
128*2112 + 2112*16 + 128*16 = 306176
306176*22 = 6735872 real accesses
This is a 128x2112 by 2112x352 matrix multiply
work_size = {88, 4, 8}
Each kernel run computes 16 outputs
0x7f7e8a6380 convolution_horizontal_reduced_reads_1x1 -- 88 4 8 -- 4 4 8
image2d_t input = 0x7f7f490b00 image 8448 x 8 rp 67840
short startPackedInputChannel = 0
short numPackedInputChannelsForGroup = 528
short totalNumPackedInputChannels = 528
short packedOuputChannelOffset = 0
short totalNumPackedOutputChannels = 88
image2d_t weights = 0x7f7f52fb80 image 2112 x 88 rp 16896
float* biases = 0x7f7f564d80 buffer 1408
short filterSizeX = 1
short filterSizeY = 1
image2d_t output = 0x7f7f490e80 image 1408 x 8 rp 11264
short paddingX = 0
short paddingY = 0
short strideX = 1
short strideY = 1
short neuron = 0
float a = 1.000000
float b = 1.000000
float min_clamp = 0.000000
float max_clamp = 0.000000
float* parameters = 0x0
float* batchNormBiases = 0x0
short numOutputColumns = 16
*/
#define GEMM
#define IMAGE
void dump_maps() {
FILE *f = fopen("/proc/self/maps", "rb");
char maps[0x100000];
int len = fread(maps, 1, sizeof(maps), f);
maps[len] = '\0';
maps[0x800] = '\0';
fclose(f);
printf("%s\n", maps);
}
static inline uint64_t nanos_since_boot() {
struct timespec t;
clock_gettime(CLOCK_BOOTTIME, &t);
return t.tv_sec * 1000000000ULL + t.tv_nsec;
}
int main(int argc, char *argv[]) {
cl_int err;
// cl init
cl_device_id device_id;
cl_context context;
cl_command_queue q;
{
cl_platform_id platform_id[2];
cl_uint num_devices;
cl_uint num_platforms;
err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms);
assert(err == 0);
err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices);
assert(err == 0);
context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
assert(err == 0);
q = clCreateCommandQueue(context, device_id, 0, &err);
assert(err == 0);
}
printf("cl ready\n");
char tmp[0x10000];
memset(tmp, 0, sizeof(tmp));
FILE *f = fopen(argv[1], "rb");
fread(tmp, 1, sizeof(tmp), f);
fclose(f);
const char *strings[1];
size_t lengths[1];
strings[0] = tmp;
lengths[0] = strlen(tmp);
cl_program prog = clCreateProgramWithSource(context, 1, strings, lengths, &err);
assert(err == 0);
printf("creating program\n");
err = clBuildProgram(prog, 1, &device_id, "-D AVANTE_IS_GPU_A530_64", NULL, NULL);
if (err != 0) {
printf("got err %d\n", err);
size_t length;
char buffer[2048];
clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
buffer[length] = '\0';
printf("%s\n", buffer);
}
assert(err == 0);
printf("built program\n");
#ifdef GEMM
// 128x2112 by 2112x352
int M,N,K;
M = N = K = 1024;
//M = 128; K = 2112; N = 352;
cl_kernel kern = clCreateKernel(prog, "gemm", &err);
assert(err == 0);
printf("creating kernel %p\n", kern);
cl_mem A,B,C;
A = clCreateBuffer(context, CL_MEM_READ_WRITE, M*K*2, NULL, &err);
assert(err == 0);
B = clCreateBuffer(context, CL_MEM_READ_WRITE, K*N*2, NULL, &err);
assert(err == 0);
C = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*2, NULL, &err);
assert(err == 0);
printf("created buffers\n");
#ifdef IMAGE
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_HALF_FLOAT;
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0;
desc.image_width = K; desc.image_height = M/4;
desc.buffer = A;
desc.image_row_pitch = desc.image_width*8;
A = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = K; desc.image_height = N/4;
desc.buffer = B; desc.image_row_pitch = desc.image_width*8;
B = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = M/4; desc.image_height = N;
desc.buffer = C; desc.image_row_pitch = desc.image_width*8;
C = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
printf("created images\n");
#endif
clSetKernelArg(kern, 0, sizeof(int), &M);
clSetKernelArg(kern, 1, sizeof(int), &N);
clSetKernelArg(kern, 2, sizeof(int), &K);
clSetKernelArg(kern, 3, sizeof(cl_mem), &A);
clSetKernelArg(kern, 4, sizeof(cl_mem), &B);
clSetKernelArg(kern, 5, sizeof(cl_mem), &C);
printf("set args\n");
#ifdef IMAGE
size_t global_work_size[3] = {M/4, N/4, 1};
size_t local_work_size[3] = {4, 64, 1};
#else
size_t global_work_size[3] = {128, 128, 1};
size_t local_work_size[3] = {2, 128, 1};
#endif
#else
cl_kernel kern = clCreateKernel(prog, "convolution_horizontal_reduced_reads_1x1", &err);
assert(err == 0);
printf("creating kernel\n");
cl_mem input;
cl_mem weights;
cl_mem weights_buffer;
cl_mem biases;
cl_mem outputs;
cl_image_format fmt;
fmt.image_channel_order = CL_RGBA;
fmt.image_channel_data_type = CL_HALF_FLOAT;
cl_image_desc desc;
desc.image_type = CL_MEM_OBJECT_IMAGE2D;
desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0;
desc.buffer = NULL;
biases = clCreateBuffer(context, CL_MEM_READ_WRITE, 1408, NULL, &err);
assert(err == 0);
desc.image_width = 8448; desc.image_height = 8; desc.image_row_pitch = 67840;
desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
input = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = 2112; desc.image_height = 88; desc.image_row_pitch = 16896;
weights_buffer = desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
weights = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
desc.image_width = 1408; desc.image_height = 8; desc.image_row_pitch = 11264;
desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
assert(err == 0);
outputs = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
assert(err == 0);
void *n = NULL;
uint16_t v;
float fl;
clSetKernelArg(kern, 0, sizeof(cl_mem), &input);
v = 0; clSetKernelArg(kern, 1, sizeof(v), &v);
v = 528; clSetKernelArg(kern, 2, sizeof(v), &v);
v = 528; clSetKernelArg(kern, 3, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 4, sizeof(v), &v);
v = 88; clSetKernelArg(kern, 5, sizeof(v), &v);
clSetKernelArg(kern, 6, sizeof(cl_mem), &weights);
//clSetKernelArg(kern, 6, sizeof(cl_mem), &weights_buffer);
clSetKernelArg(kern, 7, sizeof(cl_mem), &biases);
v = 1; clSetKernelArg(kern, 8, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 9, sizeof(v), &v);
clSetKernelArg(kern, 10, sizeof(cl_mem), &outputs);
v = 0; clSetKernelArg(kern, 11, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 12, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 13, sizeof(v), &v);
v = 1; clSetKernelArg(kern, 14, sizeof(v), &v);
v = 0; clSetKernelArg(kern, 15, sizeof(v), &v);
fl = 1.0; clSetKernelArg(kern, 16, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 17, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 18, sizeof(fl), &fl);
fl = 0.0; clSetKernelArg(kern, 19, sizeof(fl), &fl);
clSetKernelArg(kern, 20, sizeof(n), &n);
clSetKernelArg(kern, 21, sizeof(n), &n);
v = 16; clSetKernelArg(kern, 22, sizeof(v), &v);
size_t global_work_size[3] = {88, 4, 8};
size_t local_work_size[3] = {4, 4, 8};
#endif
printf("ready to enqueue\n");
for (int i = 0; i < 20; i++) {
cl_event event;
err = clEnqueueNDRangeKernel(q, kern, 3, NULL, global_work_size, local_work_size, 0, NULL, &event);
assert(err == 0);
uint64_t tb = nanos_since_boot();
err = clWaitForEvents(1, &event);
assert(err == 0);
uint64_t te = nanos_since_boot();
uint64_t us = (te-tb)/1000;
float s = 1000000.0/us;
#ifdef GEMM
float flops = M*N*K*s;
float rams = (M*N + N*K + M*K)*s;
#else
float flops = 95158272.0*s;
float rams = 1058816.0*s;
//float rams = 6735872.0*s;
#endif
printf("%2d: wait %lu us -- %.2f GFLOPS -- %.2f GB/s\n", i, us, flops/1e9, rams*2/1e9);
}
size_t binary_size = 0;
err = clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL);
assert(err == 0);
assert(binary_size > 0);
uint8_t *binary_buf = (uint8_t *)malloc(binary_size);
assert(binary_buf);
uint8_t* bufs[1] = { binary_buf, };
err = clGetProgramInfo(prog, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL);
assert(err == 0);
FILE *g = fopen("/tmp/bin.bin", "wb");
fwrite(binary_buf, 1, binary_size, g);
fclose(g);
/*dump_maps();
for (uint64_t i = 0x7ffbd2000; i < 0x800000000; i += 0x1000) {
uint64_t cmd = *((uint64_t*)i);
printf("%llx: %llx\n", i, cmd);
}*/
return 0;
}