openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

194 lines
4.4 KiB

// single: clang -O2 -march=native gemm.c
// multi: clang -O2 -march=native gemm.c -DNTHREADS=32 -lpthread
#define _GNU_SOURCE
// https://en.wikichip.org/wiki/amd/microarchitectures/zen_2
#include <stdint.h>
#include <time.h>
#include <sched.h>
#include <stdio.h>
#include <assert.h>
#include <math.h>
#include <string.h>
#include <immintrin.h>
#include <pthread.h>
#include <unistd.h>
#include <stdatomic.h>
//#define DEBUG
#ifdef DEBUG
#define N 8
#endif
#ifndef N
// NOTE: if you change this you have to rerun gemm.py
#define N 512
#endif
#ifndef NTHREADS
#define NTHREADS 1
#endif
// aligned?
float A[N*N] __attribute__ ((aligned (64)));
float B[N*N] __attribute__ ((aligned (64)));
float C[N*N] __attribute__ ((aligned (64)));
float val[N*N] __attribute__ ((aligned (64)));
__m256 *Am = (__m256*)A;
__m256 *Bm = (__m256*)B;
__m256 *Cm = (__m256*)C;
uint64_t nanos() {
struct timespec start;
clock_gettime(CLOCK_MONOTONIC_RAW, &start);
return (uint64_t)start.tv_sec*1000000000 + (uint64_t)start.tv_nsec;
}
float Bf[N*N] __attribute__ ((aligned (64)));
__m256 *Bfm = (__m256*)Bf;
#define BLOCK 8
#define BLOCK_Y 4
#define BLOCK_X 2
void matmul(int sy, int ey) {
// 136.77 GFLOPS on single core numpy
// 4.9 GHz is max boost for 5950X
// 32 FLOPS/cycle (16 FMAs, aka 2x 8 single wide / 32 byte FMAs)
// theoretical max is 156.8 GFLOPS, we see 150
// multicore theo max = 2508.8 GFLOPS, we see 1501.434299
// Bf = (y/8, k, 8)
for (int y = sy; y < ey; y+=BLOCK_Y) {
for (int x = 0; x < N; x+=BLOCK*BLOCK_X) {
__m256 acc[BLOCK_Y][BLOCK_X] = {};
for (int k = 0; k < N; k++) {
for (int iy = 0; iy < BLOCK_Y; iy++) {
__m256 ta = _mm256_broadcast_ss(&A[(y+iy)*N + k]);
for (int ix = 0; ix < BLOCK_X; ix++) {
acc[iy][ix] = _mm256_fmadd_ps(ta, Bfm[((x+ix*BLOCK)*N + k*8)/8], acc[iy][ix]);
}
}
}
for (int iy = 0; iy < BLOCK_Y; iy++) {
for (int ix = 0; ix < BLOCK_X; ix++) {
Cm[((y+iy)*N + x + ix * BLOCK)/8] = acc[iy][ix];
}
}
}
}
}
pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
atomic_int nready = 0;
atomic_int ndone = 0;
void *matmul_thread(void *n) {
int k = (int)(int64_t)n;
int sy = (N/NTHREADS) * k;
int ey = (N/NTHREADS) * (k+1);
cpu_set_t set;
CPU_ZERO(&set);
CPU_SET(k,&set);
pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &set);
nready++;
// gotta have main lock once to signal start
pthread_mutex_lock(&lock);
pthread_mutex_unlock(&lock);
matmul(sy, ey);
// we done
ndone++;
return NULL;
}
int main() {
printf("hello with %d threads\n", NTHREADS);
#ifdef DEBUG
for (int i = 0; i < N*N; i++) A[i] = i;
for (int i = 0; i < N*N; i++) B[i] = i;
#else
FILE *f = fopen("/tmp/matmul", "rb");
if (f == NULL) {
printf("please pregenerate python /tmp/matmul file\n");
return -1;
}
fread(A, 1, sizeof(float)*N*N, f);
fread(B, 1, sizeof(float)*N*N, f);
fread(val, 1, sizeof(float)*N*N, f);
fclose(f);
#endif
// preswizzle
for (int y = 0; y < N; y+=8) {
for (int x = 0; x < N; x++) {
for (int iy = 0; iy < 8; iy++) {
Bf[y*N + x*8 + iy] = B[(y+iy)*N + x];
}
}
}
for (int i = 0; i < 10; i++) {
memset(C, 0, N*N*sizeof(float));
#if NTHREADS != 1
nready = 0;
ndone = 0;
pthread_mutex_lock(&lock);
pthread_t threads[NTHREADS];
for (int j = 0; j < NTHREADS; j++) {
pthread_create(&threads[j], NULL, matmul_thread, (void *)(uint64_t)j);
}
while (nready != NTHREADS) usleep(1);
#endif
uint64_t start = nanos();
#if NTHREADS == 1
matmul(0, N);
#else
// unlocking mutex starts threads
pthread_mutex_unlock(&lock);
while (ndone != NTHREADS) usleep(1);
#endif
uint64_t end = nanos();
#if NTHREADS != 1
for (int j = 0; j < NTHREADS; j++) {
pthread_join(threads[j], NULL);
}
#endif
double gflop = (2.0*N*N*N)*1e-9;
double s = (end-start)*1e-9;
printf("%f GFLOP/S -- %.2f ms\n", gflop/s, s*1e3);
// hack around throttling
//if (i%4 == 0) sleep(1);
}
#ifdef DEBUG
for (int i = 0; i < N*N; i++) {
if (i%N == 0 && i != 0) printf("\n");
printf("%f ", C[i]);
}
printf("\n");
#else
for (int k = 0; k < N*N; k++) {
if (fabsf(C[k] - val[k]) > 1e-3) {
printf("MISMATCH AT %d, %f != %f\n", k, C[k], val[k]);
return -1;
}
}
printf("match\n");
#endif
return 0;
}