You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							154 lines
						
					
					
						
							5.3 KiB
						
					
					
				
			
		
		
	
	
							154 lines
						
					
					
						
							5.3 KiB
						
					
					
				| # copying the kernels from https://github.com/microsoft/ArchProbe into Python
 | |
| import numpy as np
 | |
| import pickle
 | |
| from tinygrad.runtime.ops_cl import CLProgram, CLBuffer
 | |
| from tinygrad import dtypes
 | |
| from tqdm import trange, tqdm
 | |
| from matplotlib import pyplot as plt
 | |
| 
 | |
| tests = {}
 | |
| def register_test(fxn):
 | |
|   tests[fxn.__name__] = fxn
 | |
| 
 | |
| def warp_size2(nthread):
 | |
|   prg = """__kernel void warp_size2(
 | |
|     __global float* src,
 | |
|     __global int* dst,
 | |
|     const int niter,
 | |
|     const int prime_number
 | |
|   ) {
 | |
|     int drain = 0;
 | |
|     for (int j = 0; j < niter; ++j) {
 | |
|       drain += j / prime_number;
 | |
|       barrier(0);
 | |
|     }
 | |
|     dst[get_local_id(0)] = drain;
 | |
|   }"""
 | |
|   src_buf = CLBuffer(1, dtypes.float32)
 | |
|   dst_buf = CLBuffer(1, dtypes.int32)
 | |
|   cl = CLProgram("warp_size2", prg, argdtypes=[None, None, np.int32, np.int32])
 | |
|   return min([cl([nthread, 1024, 1], [nthread, 1, 1], src_buf, dst_buf, 10, 3, wait=True) for _ in range(5)])*1e9
 | |
| 
 | |
| @register_test
 | |
| def test_warp_size():
 | |
|   return [(nthread, warp_size2(nthread)) for nthread in trange(1,256)]
 | |
| 
 | |
| def reg_count(nthread, ngrp, nreg):
 | |
|   reg_declr = ''.join([f"float reg_data{i} = (float)niter + {i};\n" for i in range(nreg)])
 | |
|   reg_comp = ''.join([f"reg_data{i} *= {(i-1)%nreg};\n" for i in range(nreg)])
 | |
|   reg_reduce = ''.join([f"out_buf[{i}] = reg_data{i};\n" for i in range(nreg)])
 | |
|   prg = f"""__kernel void reg_count(
 | |
|     __global float* out_buf,
 | |
|     __private const int niter
 | |
|   ) {{
 | |
|     {reg_declr}
 | |
|     int i = 0;
 | |
|     for (; i < niter; ++i) {{
 | |
|       {reg_comp}
 | |
|     }}
 | |
|     i = i >> 31;
 | |
|     {reg_reduce}
 | |
|   }}"""
 | |
|   out_buf = CLBuffer(1, dtypes.float32)
 | |
|   cl = CLProgram("reg_count", prg, argdtypes=[None, np.int32])
 | |
|   return min([cl([nthread, ngrp, 1], [nthread, 1, 1], out_buf, 20, wait=True) for _ in range(10)])*1e9
 | |
| 
 | |
| @register_test
 | |
| def test_reg_count(nthread=1, ngrp=1):
 | |
|   base = reg_count(nthread, ngrp, 1)
 | |
|   return [(nreg, (reg_count(nthread, ngrp, nreg)-base)/nreg) for nreg in trange(4, 513, 4)]
 | |
| 
 | |
| def buf_cache_hierarchy_pchase(ndata, stride=1, NCOMP=1, steps=65536):
 | |
|   ndata //= NCOMP*4  # ptr size
 | |
|   prg = f"""__kernel void buf_cache_hierarchy_pchase(
 | |
|     __global int{str(NCOMP) if NCOMP > 1 else ''}* src,
 | |
|     __global int* dst,
 | |
|     const int niter
 | |
|   ) {{
 | |
|     int idx = 0;
 | |
|     for (int i = 0; i < niter; ++i) {{
 | |
|       idx = src[idx]{'.x' if NCOMP > 1 else ''};
 | |
|     }}
 | |
|     *dst = idx;
 | |
|   }}"""
 | |
|   idx_buf = np.zeros(ndata*NCOMP, dtype=np.int32)
 | |
|   for i in range(ndata): idx_buf[i*NCOMP] = (i + stride) % ndata
 | |
|   in_buf = CLBuffer.fromCPU(idx_buf)
 | |
|   out_buf = CLBuffer(1, dtypes.int32)
 | |
|   cl = CLProgram("buf_cache_hierarchy_pchase", prg, argdtypes=[None, None, np.int32])
 | |
|   return min([cl([1, 1, 1], [1, 1, 1], in_buf, out_buf, steps, wait=True)/steps for _ in range(5)])*1e9
 | |
| 
 | |
| @register_test
 | |
| def test_memory_latency():
 | |
|   # requires cacheline < 16
 | |
|   szs = [int(1.3**x) for x in range(20, 70)]
 | |
|   return [(ndata, buf_cache_hierarchy_pchase(ndata, NCOMP=16, steps=128*1024)) for ndata in tqdm(szs)]
 | |
| 
 | |
| @register_test
 | |
| def test_cacheline_size():
 | |
|   # TODO: this buffer must be at least 2x the L1 cache for this test to work
 | |
|   return [(stride, buf_cache_hierarchy_pchase(4*65536, stride, steps=65536)) for stride in trange(1,64)]
 | |
| 
 | |
| def cl_read(sz, niter=1):
 | |
|   prg = f"""__kernel void copy(
 | |
|     __global float4* src,
 | |
|     __global float* dst) {{
 | |
|       int gid = get_global_id(0);
 | |
|       if (src[gid].x == 99+get_global_id(1)) *dst = 1;
 | |
|   }}"""
 | |
| 
 | |
|   in_buf = CLBuffer(sz//4, dtypes.float32)
 | |
|   out_buf = CLBuffer(1, dtypes.float32)
 | |
|   cl = CLProgram("copy", prg)
 | |
|   # NOTE: if nay of the niters form a local group, this is wrong
 | |
|   return min([cl([sz//16, niter, 1], [1, 1, 1], in_buf, out_buf, wait=True) for _ in range(10)])*1e9
 | |
| 
 | |
| @register_test
 | |
| def test_read_bandwidth():
 | |
|   szs = list(range(128*1024, 20*1024*1024, 128*1024))
 | |
|   NITER = 8
 | |
|   base = cl_read(16, niter=NITER)
 | |
|   return [(sz, (sz*NITER)/(cl_read(sz, niter=NITER)-base)) for sz in tqdm(szs)]
 | |
| 
 | |
| 
 | |
| def gflops(niter=4, nroll=4, ngroups=4096):
 | |
|   NCOMP = 8
 | |
|   prg = f"""__kernel void gflops(
 | |
|     __global float* out_buf
 | |
|     ) {{
 | |
|       float{NCOMP} x = (float{NCOMP})({",".join(f"get_local_id(0)+{i}" for i in range(NCOMP))});
 | |
|       float{NCOMP} y = (float{NCOMP})({",".join(f"get_local_id(1)+{i}" for i in range(NCOMP))});
 | |
| 
 | |
|       for (int i = 0; i < {niter}; i++) {{
 | |
|         {''.join(['x = mad(y, y, x); y = mad(x, x, y);'+chr(10)]*nroll)}
 | |
|       }}
 | |
| 
 | |
|       out_buf[get_global_id(0) >> 31] = {'+'.join(f"y.s{'0123456789abcdef'[i]}" for i in range(NCOMP))};
 | |
|   }}"""
 | |
|   out_buf = CLBuffer(1, dtypes.float32)
 | |
|   cl = CLProgram("gflops", prg, options="-cl-mad-enable -cl-fast-relaxed-math")
 | |
|   FLOPS = NCOMP*2*2 * niter * nroll * ngroups * 32
 | |
|   # NOTE: if nay of the niters form a local group, this is wrong
 | |
|   return FLOPS/(min([cl([32, ngroups, 1], [32, 1, 1], out_buf, wait=True) for _ in range(10)])*1e9)
 | |
| 
 | |
| @register_test
 | |
| def test_gflops():
 | |
|   return [(niter, gflops(niter=niter, nroll=32)) for niter in trange(1, 32, 1)]
 | |
| 
 | |
| if __name__ == "__main__":
 | |
|   cache = {}
 | |
|   #cache = pickle.load(open("/tmp/cache.pkl", "rb"))
 | |
|   #tests = {"test_cacheline_size": tests["test_cacheline_size"]}
 | |
|   plt.figure(figsize=(16, 9))
 | |
|   for i,(k,test) in enumerate(tests.items()):
 | |
|     print(f"running {k}")
 | |
|     plt.subplot(2, (len(tests)+1)//2, i+1)
 | |
|     plt.title(k)
 | |
|     if k == "test_memory_latency": plt.xscale('log')
 | |
|     if k not in cache: cache[k] = test()
 | |
|     plt.plot(*zip(*cache[k]))
 | |
|   #pickle.dump(cache, open("/tmp/cache.pkl", "wb"))
 | |
| 
 | |
|   plt.tight_layout(pad=0.5)
 | |
|   plt.savefig("/tmp/results.png")
 | |
|   plt.show()
 | |
| 
 |