You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							314 lines
						
					
					
						
							9.4 KiB
						
					
					
				
			
		
		
	
	
							314 lines
						
					
					
						
							9.4 KiB
						
					
					
				| #include <stdio.h>
 | |
| #include <stdlib.h>
 | |
| #include <CL/cl.h>
 | |
| #include <assert.h>
 | |
| #include <time.h>
 | |
| 
 | |
| /*
 | |
| block7b_project_conv (Conv2D)   (None, 8, 16, 352)   743424      block7b_activation[0][0]
 | |
| 8448*8*4 = 8*16*2112 = 270336 = input = 128*2112
 | |
| 2112*88*4 = 743424 = weights = 2112*352
 | |
| 1408*8*4 = 8*16*352 = 45056 = output = 128*352
 | |
| 
 | |
| FLOPS = 128*2112*352 = 95158272 = 95 MFLOPS
 | |
| RAM = 128*2112 + 2112*352 + 128*352 = 1058816 = 1 M accesses
 | |
| 
 | |
| # 22 groups
 | |
| 128*2112 + 2112*16 + 128*16 = 306176
 | |
| 306176*22 = 6735872 real accesses
 | |
| 
 | |
| This is a 128x2112 by 2112x352 matrix multiply
 | |
| 
 | |
| work_size = {88, 4, 8}
 | |
| Each kernel run computes 16 outputs
 | |
| 
 | |
| 0x7f7e8a6380                 convolution_horizontal_reduced_reads_1x1 --   88    4    8  --    4    4    8
 | |
|   image2d_t input = 0x7f7f490b00 image 8448 x 8 rp 67840
 | |
|   short startPackedInputChannel = 0
 | |
|   short numPackedInputChannelsForGroup = 528
 | |
|   short totalNumPackedInputChannels = 528
 | |
|   short packedOuputChannelOffset = 0
 | |
|   short totalNumPackedOutputChannels = 88
 | |
|   image2d_t weights = 0x7f7f52fb80 image 2112 x 88 rp 16896
 | |
|   float* biases = 0x7f7f564d80 buffer 1408
 | |
|   short filterSizeX = 1
 | |
|   short filterSizeY = 1
 | |
|   image2d_t output = 0x7f7f490e80 image 1408 x 8 rp 11264
 | |
|   short paddingX = 0
 | |
|   short paddingY = 0
 | |
|   short strideX = 1
 | |
|   short strideY = 1
 | |
|   short neuron = 0
 | |
|   float a = 1.000000
 | |
|   float b = 1.000000
 | |
|   float min_clamp = 0.000000
 | |
|   float max_clamp = 0.000000
 | |
|   float* parameters = 0x0
 | |
|   float* batchNormBiases = 0x0
 | |
|   short numOutputColumns = 16
 | |
| */
 | |
| 
 | |
| #define GEMM
 | |
| #define IMAGE
 | |
| 
 | |
| void dump_maps() {
 | |
|   FILE *f = fopen("/proc/self/maps", "rb");
 | |
|   char maps[0x100000];
 | |
|   int len = fread(maps, 1, sizeof(maps), f);
 | |
|   maps[len] = '\0';
 | |
|   maps[0x800] = '\0';
 | |
|   fclose(f);
 | |
|   printf("%s\n", maps);
 | |
| }
 | |
| 
 | |
| static inline uint64_t nanos_since_boot() {
 | |
|   struct timespec t;
 | |
|   clock_gettime(CLOCK_BOOTTIME, &t);
 | |
|   return t.tv_sec * 1000000000ULL + t.tv_nsec;
 | |
| }
 | |
| 
 | |
| int main(int argc, char *argv[]) {
 | |
|   cl_int err;
 | |
| 
 | |
|   // cl init
 | |
|   cl_device_id device_id;
 | |
|   cl_context context;
 | |
|   cl_command_queue q;
 | |
|   {
 | |
|     cl_platform_id platform_id[2];
 | |
|     cl_uint num_devices;
 | |
|     cl_uint num_platforms;
 | |
| 
 | |
|     err = clGetPlatformIDs(sizeof(platform_id)/sizeof(cl_platform_id), platform_id, &num_platforms);
 | |
|     assert(err == 0);
 | |
| 
 | |
|     err = clGetDeviceIDs(platform_id[0], CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices);
 | |
|     assert(err == 0);
 | |
| 
 | |
|     context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err);
 | |
|     assert(err == 0);
 | |
| 
 | |
|     q = clCreateCommandQueue(context, device_id, 0, &err);
 | |
|     assert(err == 0);
 | |
|   }
 | |
|   printf("cl ready\n");
 | |
| 
 | |
|   char tmp[0x10000];
 | |
|   memset(tmp, 0, sizeof(tmp));
 | |
|   FILE *f = fopen(argv[1], "rb");
 | |
|   fread(tmp, 1, sizeof(tmp), f);
 | |
|   fclose(f);
 | |
| 
 | |
|   const char *strings[1];
 | |
|   size_t lengths[1];
 | |
|   strings[0] = tmp;
 | |
|   lengths[0] = strlen(tmp);
 | |
| 
 | |
|   cl_program prog = clCreateProgramWithSource(context, 1, strings, lengths, &err);
 | |
|   assert(err == 0);
 | |
|   printf("creating program\n");
 | |
| 
 | |
|   err = clBuildProgram(prog, 1, &device_id, "-D AVANTE_IS_GPU_A530_64", NULL, NULL);
 | |
| 
 | |
|   if (err != 0) {
 | |
|     printf("got err %d\n", err);
 | |
|     size_t length;
 | |
|     char buffer[2048];
 | |
|     clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length);
 | |
|     buffer[length] = '\0';
 | |
|     printf("%s\n", buffer);
 | |
|   }
 | |
|   assert(err == 0);
 | |
|   printf("built program\n");
 | |
| 
 | |
| 
 | |
| #ifdef GEMM
 | |
|   // 128x2112 by 2112x352
 | |
|   int M,N,K;
 | |
| 
 | |
|   M = N = K = 1024;
 | |
|   //M = 128; K = 2112; N = 352;
 | |
| 
 | |
|   cl_kernel kern = clCreateKernel(prog, "gemm", &err);
 | |
|   assert(err == 0);
 | |
|   printf("creating kernel %p\n", kern);
 | |
| 
 | |
|   cl_mem A,B,C;
 | |
|   A = clCreateBuffer(context, CL_MEM_READ_WRITE, M*K*2, NULL, &err);
 | |
|   assert(err == 0);
 | |
|   B = clCreateBuffer(context, CL_MEM_READ_WRITE, K*N*2, NULL, &err);
 | |
|   assert(err == 0);
 | |
|   C = clCreateBuffer(context, CL_MEM_READ_WRITE, M*N*2, NULL, &err);
 | |
|   assert(err == 0);
 | |
|   printf("created buffers\n");
 | |
| 
 | |
| #ifdef IMAGE
 | |
|   cl_image_format fmt;
 | |
|   fmt.image_channel_order = CL_RGBA;
 | |
|   fmt.image_channel_data_type = CL_HALF_FLOAT;
 | |
| 
 | |
|   cl_image_desc desc;
 | |
|   desc.image_type = CL_MEM_OBJECT_IMAGE2D;
 | |
|   desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0;
 | |
| 
 | |
|   desc.image_width = K; desc.image_height = M/4;
 | |
|   desc.buffer = A;
 | |
|   desc.image_row_pitch = desc.image_width*8;
 | |
|   A = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
 | |
|   assert(err == 0);
 | |
| 
 | |
|   desc.image_width = K; desc.image_height = N/4;
 | |
|   desc.buffer = B; desc.image_row_pitch = desc.image_width*8;
 | |
|   B = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
 | |
|   assert(err == 0);
 | |
| 
 | |
|   desc.image_width = M/4; desc.image_height = N;
 | |
|   desc.buffer = C; desc.image_row_pitch = desc.image_width*8;
 | |
|   C = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
 | |
|   assert(err == 0);
 | |
|   printf("created images\n");
 | |
| #endif
 | |
| 
 | |
|   clSetKernelArg(kern, 0, sizeof(int), &M);
 | |
|   clSetKernelArg(kern, 1, sizeof(int), &N);
 | |
|   clSetKernelArg(kern, 2, sizeof(int), &K);
 | |
| 
 | |
|   clSetKernelArg(kern, 3, sizeof(cl_mem), &A);
 | |
|   clSetKernelArg(kern, 4, sizeof(cl_mem), &B);
 | |
|   clSetKernelArg(kern, 5, sizeof(cl_mem), &C);
 | |
|   printf("set args\n");
 | |
| 
 | |
| #ifdef IMAGE
 | |
|   size_t global_work_size[3] = {M/4, N/4, 1};
 | |
|   size_t local_work_size[3] = {4, 64, 1};
 | |
| #else
 | |
|   size_t global_work_size[3] = {128, 128, 1};
 | |
|   size_t local_work_size[3] = {2, 128, 1};
 | |
| #endif
 | |
| 
 | |
| #else
 | |
|   cl_kernel kern = clCreateKernel(prog, "convolution_horizontal_reduced_reads_1x1", &err);
 | |
|   assert(err == 0);
 | |
|   printf("creating kernel\n");
 | |
| 
 | |
|   cl_mem input;
 | |
|   cl_mem weights;
 | |
|   cl_mem weights_buffer;
 | |
|   cl_mem biases;
 | |
|   cl_mem outputs;
 | |
| 
 | |
|   cl_image_format fmt;
 | |
|   fmt.image_channel_order = CL_RGBA;
 | |
|   fmt.image_channel_data_type = CL_HALF_FLOAT;
 | |
| 
 | |
|   cl_image_desc desc;
 | |
|   desc.image_type = CL_MEM_OBJECT_IMAGE2D;
 | |
|   desc.image_depth = 0; desc.image_slice_pitch = 0; desc.num_mip_levels = 0; desc.num_samples = 0;
 | |
|   desc.buffer = NULL;
 | |
| 
 | |
|   biases = clCreateBuffer(context, CL_MEM_READ_WRITE, 1408, NULL, &err);
 | |
|   assert(err == 0);
 | |
| 
 | |
|   desc.image_width = 8448; desc.image_height = 8; desc.image_row_pitch = 67840;
 | |
|   desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
 | |
|   assert(err == 0);
 | |
|   input = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
 | |
|   assert(err == 0);
 | |
| 
 | |
|   desc.image_width = 2112; desc.image_height = 88; desc.image_row_pitch = 16896;
 | |
|   weights_buffer = desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
 | |
|   assert(err == 0);
 | |
|   weights = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
 | |
|   assert(err == 0);
 | |
| 
 | |
|   desc.image_width = 1408; desc.image_height = 8; desc.image_row_pitch = 11264;
 | |
|   desc.buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, desc.image_height * desc.image_row_pitch, NULL, &err);
 | |
|   assert(err == 0);
 | |
|   outputs = clCreateImage(context, CL_MEM_READ_WRITE, &fmt, &desc, NULL, &err);
 | |
|   assert(err == 0);
 | |
| 
 | |
|   void *n = NULL;
 | |
|   uint16_t v;
 | |
|   float fl;
 | |
| 
 | |
|   clSetKernelArg(kern, 0, sizeof(cl_mem), &input);
 | |
|   v = 0; clSetKernelArg(kern, 1, sizeof(v), &v);
 | |
|   v = 528; clSetKernelArg(kern, 2, sizeof(v), &v);
 | |
|   v = 528; clSetKernelArg(kern, 3, sizeof(v), &v);
 | |
|   v = 0; clSetKernelArg(kern, 4, sizeof(v), &v);
 | |
|   v = 88; clSetKernelArg(kern, 5, sizeof(v), &v);
 | |
|   clSetKernelArg(kern, 6, sizeof(cl_mem), &weights);
 | |
|   //clSetKernelArg(kern, 6, sizeof(cl_mem), &weights_buffer);
 | |
|   clSetKernelArg(kern, 7, sizeof(cl_mem), &biases);
 | |
|   v = 1; clSetKernelArg(kern, 8, sizeof(v), &v);
 | |
|   v = 1; clSetKernelArg(kern, 9, sizeof(v), &v);
 | |
|   clSetKernelArg(kern, 10, sizeof(cl_mem), &outputs);
 | |
|   v = 0; clSetKernelArg(kern, 11, sizeof(v), &v);
 | |
|   v = 0; clSetKernelArg(kern, 12, sizeof(v), &v);
 | |
|   v = 1; clSetKernelArg(kern, 13, sizeof(v), &v);
 | |
|   v = 1; clSetKernelArg(kern, 14, sizeof(v), &v);
 | |
|   v = 0; clSetKernelArg(kern, 15, sizeof(v), &v);
 | |
|   fl = 1.0; clSetKernelArg(kern, 16, sizeof(fl), &fl);
 | |
|   fl = 0.0; clSetKernelArg(kern, 17, sizeof(fl), &fl);
 | |
|   fl = 0.0; clSetKernelArg(kern, 18, sizeof(fl), &fl);
 | |
|   fl = 0.0; clSetKernelArg(kern, 19, sizeof(fl), &fl);
 | |
|   clSetKernelArg(kern, 20, sizeof(n), &n);
 | |
|   clSetKernelArg(kern, 21, sizeof(n), &n);
 | |
|   v = 16; clSetKernelArg(kern, 22, sizeof(v), &v);
 | |
| 
 | |
|   size_t global_work_size[3] = {88, 4, 8};
 | |
|   size_t local_work_size[3] = {4, 4, 8};
 | |
| #endif
 | |
| 
 | |
|   printf("ready to enqueue\n");
 | |
|   for (int i = 0; i < 20; i++) {
 | |
|     cl_event event;
 | |
|     err = clEnqueueNDRangeKernel(q, kern, 3, NULL, global_work_size, local_work_size, 0, NULL, &event);
 | |
|     assert(err == 0);
 | |
| 
 | |
|     uint64_t tb = nanos_since_boot();
 | |
|     err = clWaitForEvents(1, &event);
 | |
|     assert(err == 0);
 | |
|     uint64_t te = nanos_since_boot();
 | |
|     uint64_t us = (te-tb)/1000;
 | |
| 
 | |
|     float s = 1000000.0/us;
 | |
| 
 | |
| #ifdef GEMM
 | |
|     float flops = M*N*K*s;
 | |
|     float rams = (M*N + N*K + M*K)*s;
 | |
| #else
 | |
|     float flops = 95158272.0*s;
 | |
|     float rams = 1058816.0*s;
 | |
|     //float rams = 6735872.0*s;
 | |
| #endif
 | |
| 
 | |
|     printf("%2d: wait %lu us -- %.2f GFLOPS -- %.2f GB/s\n", i, us, flops/1e9, rams*2/1e9);
 | |
|   }
 | |
| 
 | |
|   size_t binary_size = 0;
 | |
|   err = clGetProgramInfo(prog, CL_PROGRAM_BINARY_SIZES, sizeof(binary_size), &binary_size, NULL);
 | |
|   assert(err == 0);
 | |
|   assert(binary_size > 0);
 | |
| 
 | |
|   uint8_t *binary_buf = (uint8_t *)malloc(binary_size);
 | |
|   assert(binary_buf);
 | |
| 
 | |
|   uint8_t* bufs[1] = { binary_buf, };
 | |
|   err = clGetProgramInfo(prog, CL_PROGRAM_BINARIES, sizeof(bufs), &bufs, NULL);
 | |
|   assert(err == 0);
 | |
| 
 | |
|   FILE *g = fopen("/tmp/bin.bin", "wb");
 | |
|   fwrite(binary_buf, 1, binary_size, g);
 | |
|   fclose(g);
 | |
| 
 | |
|   /*dump_maps();
 | |
|   for (uint64_t i = 0x7ffbd2000; i < 0x800000000; i += 0x1000) {
 | |
|     uint64_t cmd = *((uint64_t*)i);
 | |
|     printf("%llx: %llx\n", i, cmd);
 | |
|   }*/
 | |
| 
 | |
| 
 | |
|   return 0;
 | |
| }
 | |
| 
 | |
| 
 |