#include #include #include #include #include /* block7b_project_conv (Conv2D) (None, 8, 16, 352) 743424 block7b_activation[0][0] 8448*8*4 = 8*16*2112 = 270336 = input = 128*2112 2112*88*4 = 743424 = weights = 2112*352 1408*8*4 = 8*16*352 = 45056 = output = 128*352 FLOPS = 128*2112*352 = 95158272 = 95 MFLOPS RAM = 128*2112 + 2112*352 + 128*352 = 1058816 = 1 M accesses # 22 groups 128*2112 + 2112*16 + 128*16 = 306176 306176*22 = 6735872 real accesses This is a 128x2112 by 2112x352 matrix multiply work_size = {88, 4, 8} Each kernel run computes 16 outputs 0x7f7e8a6380 convolution_horizontal_reduced_reads_1x1 -- 88 4 8 -- 4 4 8 image2d_t input = 0x7f7f490b00 image 8448 x 8 rp 67840 short startPackedInputChannel = 0 short numPackedInputChannelsForGroup = 528 short totalNumPackedInputChannels = 528 short packedOuputChannelOffset = 0 short totalNumPackedOutputChannels = 88 image2d_t weights = 0x7f7f52fb80 image 2112 x 88 rp 16896 float* biases = 0x7f7f564d80 buffer 1408 short filterSizeX = 1 short filterSizeY = 1 image2d_t output = 0x7f7f490e80 image 1408 x 8 rp 11264 short paddingX = 0 short paddingY = 0 short strideX = 1 short strideY = 1 short neuron = 0 float a = 1.000000 float b = 1.000000 float min_clamp = 0.000000 float max_clamp = 0.000000 float* parameters = 0x0 float* batchNormBiases = 0x0 short numOutputColumns = 16 */ #define GEMM #define IMAGE void dump_maps() { FILE *f = fopen("/proc/self/maps", "rb"); char maps[0x100000]; int len = fread(maps, 1, sizeof(maps), f); maps[len] = '\0'; maps[0x800] = '\0'; fclose(f); printf("%s\n", maps); } static inline uint64_t nanos_since_boot() { struct timespec t; clock_gettime(CLOCK_BOOTTIME, &t); return t.tv_sec * 1000000000ULL + t.tv_nsec; } int main(int argc, char *argv[]) { cl_int err; // cl init cl_device_id device_id; cl_context context; cl_command_queue q; { cl_platform_id platform_id[2]; cl_uint num_devices; cl_uint num_platforms; err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms); assert(err == 0); err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices); assert(err == 0); context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); assert(err == 0); q = clCreateCommandQueue(context, device_id, 0, &err); assert(err == 0); } printf("cl ready\n"); char tmp[0x10000]; memset(tmp, 0, sizeof(tmp)); FILE *f = fopen(argv[1], "rb"); fread(tmp, 1, sizeof(tmp), f); fclose(f); const char *strings[1]; size_t lengths[1]; strings[0] = tmp; lengths[0] = strlen(tmp); cl_program prog = clCreateProgramWithSource(context, 1, strings, lengths, &err); assert(err == 0); printf("creating program\n"); err = clBuildProgram(prog, 1, &device_id, "-D AVANTE_IS_GPU_A530_64", NULL, NULL); if (err != 0) { printf("got err %d\n", err); size_t length; char buffer[2048]; clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); buffer[length] = '\0'; printf("%s\n", buffer); } assert(err == 0); printf("built program\n"); #ifdef GEMM // 128x2112 by 2112x352 int M,N,K; M = N = K = 1024; //M = 128; K = 2112; N = 352; cl_kernel kern = clCreateKernel(prog, "gemm", &err); assert(err == 0); printf("creating kernel %p\n", kern); cl_mem A,B,C; A = clCreateBuffer(context, CL_MEM_READ_WRITE, M*K*2, NULL, &err); assert(err == 0); B = clCreateBuffer(context, CL_MEM_READ_WRITE, K*N*2, NULL, &err); assert(err == 0); C = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*2, NULL, &err); assert(err == 0); printf("created buffers\n"); #ifdef IMAGE cl_image_format fmt; fmt.image_channel_order = CL_RGBA; fmt.image_channel_data_type = CL_HALF_FLOAT; cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0; desc.image_width = K; desc.image_height = M/4; desc.buffer = A; desc.image_row_pitch = desc.image_width*8; A = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err); assert(err == 0); desc.image_width = K; desc.image_height = N/4; desc.buffer = B; desc.image_row_pitch = desc.image_width*8; B = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err); assert(err == 0); desc.image_width = M/4; desc.image_height = N; desc.buffer = C; desc.image_row_pitch = desc.image_width*8; C = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err); assert(err == 0); printf("created images\n"); #endif clSetKernelArg(kern, 0, sizeof(int), &M); clSetKernelArg(kern, 1, sizeof(int), &N); clSetKernelArg(kern, 2, sizeof(int), &K); clSetKernelArg(kern, 3, sizeof(cl_mem), &A); clSetKernelArg(kern, 4, sizeof(cl_mem), &B); clSetKernelArg(kern, 5, sizeof(cl_mem), &C); printf("set args\n"); #ifdef IMAGE size_t global_work_size[3] = {M/4, N/4, 1}; size_t local_work_size[3] = {4, 64, 1}; #else size_t global_work_size[3] = {128, 128, 1}; size_t local_work_size[3] = {2, 128, 1}; #endif #else cl_kernel kern = clCreateKernel(prog, "convolution_horizontal_reduced_reads_1x1", &err); assert(err == 0); printf("creating kernel\n"); cl_mem input; cl_mem weights; cl_mem weights_buffer; cl_mem biases; cl_mem outputs; cl_image_format fmt; fmt.image_channel_order = CL_RGBA; fmt.image_channel_data_type = CL_HALF_FLOAT; cl_image_desc desc; desc.image_type = CL_MEM_OBJECT_IMAGE2D; desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0; desc.buffer = NULL; biases = clCreateBuffer(context, CL_MEM_READ_WRITE, 1408, NULL, &err); assert(err == 0); desc.image_width = 8448; desc.image_height = 8; desc.image_row_pitch = 67840; desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err); assert(err == 0); input = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err); assert(err == 0); desc.image_width = 2112; desc.image_height = 88; desc.image_row_pitch = 16896; weights_buffer = desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err); assert(err == 0); weights = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err); assert(err == 0); desc.image_width = 1408; desc.image_height = 8; desc.image_row_pitch = 11264; desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err); assert(err == 0); outputs = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err); assert(err == 0); void *n = NULL; uint16_t v; float fl; clSetKernelArg(kern, 0, sizeof(cl_mem), &input); v = 0; clSetKernelArg(kern, 1, sizeof(v), &v); v = 528; clSetKernelArg(kern, 2, sizeof(v), &v); v = 528; clSetKernelArg(kern, 3, sizeof(v), &v); v = 0; clSetKernelArg(kern, 4, sizeof(v), &v); v = 88; clSetKernelArg(kern, 5, sizeof(v), &v); clSetKernelArg(kern, 6, sizeof(cl_mem), &weights); //clSetKernelArg(kern, 6, sizeof(cl_mem), &weights_buffer); clSetKernelArg(kern, 7, sizeof(cl_mem), &biases); v = 1; clSetKernelArg(kern, 8, sizeof(v), &v); v = 1; clSetKernelArg(kern, 9, sizeof(v), &v); clSetKernelArg(kern, 10, sizeof(cl_mem), &outputs); v = 0; clSetKernelArg(kern, 11, sizeof(v), &v); v = 0; clSetKernelArg(kern, 12, sizeof(v), &v); v = 1; clSetKernelArg(kern, 13, sizeof(v), &v); v = 1; clSetKernelArg(kern, 14, sizeof(v), &v); v = 0; clSetKernelArg(kern, 15, sizeof(v), &v); fl = 1.0; clSetKernelArg(kern, 16, sizeof(fl), &fl); fl = 0.0; clSetKernelArg(kern, 17, sizeof(fl), &fl); fl = 0.0; clSetKernelArg(kern, 18, sizeof(fl), &fl); fl = 0.0; clSetKernelArg(kern, 19, sizeof(fl), &fl); clSetKernelArg(kern, 20, sizeof(n), &n); clSetKernelArg(kern, 21, sizeof(n), &n); v = 16; clSetKernelArg(kern, 22, sizeof(v), &v); size_t global_work_size[3] = {88, 4, 8}; size_t local_work_size[3] = {4, 4, 8}; #endif printf("ready to enqueue\n"); for (int i = 0; i < 20; i++) { cl_event event; err = clEnqueueNDRangeKernel(q, kern, 3, NULL, global_work_size, local_work_size, 0, NULL, &event); assert(err == 0); uint64_t tb = nanos_since_boot(); err = clWaitForEvents(1, &event); assert(err == 0); uint64_t te = nanos_since_boot(); uint64_t us = (te-tb)/1000; float s = 1000000.0/us; #ifdef GEMM float flops = M*N*K*s; float rams = (M*N + N*K + M*K)*s; #else float flops = 95158272.0*s; float rams = 1058816.0*s; //float rams = 6735872.0*s; #endif printf("%2d: wait %lu us -- %.2f GFLOPS -- %.2f GB/s\n", i, us, flops/1e9, rams*2/1e9); } size_t binary_size = 0; err = clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL); assert(err == 0); assert(binary_size > 0); uint8_t *binary_buf = (uint8_t *)malloc(binary_size); assert(binary_buf); uint8_t* bufs[1] = { binary_buf, }; err = clGetProgramInfo(prog, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL); assert(err == 0); FILE *g = fopen("/tmp/bin.bin", "wb"); fwrite(binary_buf, 1, binary_size, g); fclose(g); /*dump_maps(); for (uint64_t i = 0x7ffbd2000; i < 0x800000000; i += 0x1000) { uint64_t cmd = *((uint64_t*)i); printf("%llx: %llx\n", i, cmd); }*/ return 0; }