# Thread and precision environment variables must be set before torch (and
# the BLAS libraries it loads) are imported, which is why they come first.
import os
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"  # force true FP32 matmuls, no TF32
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import time
import torch
torch.set_num_threads(1)  # single-threaded CPU baseline

from tinygrad.helpers import getenv
CUDA = getenv("CUDA", 1)  # set CUDA=0 in the environment to benchmark the CPU path
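
# A small optional guard, not in the original script: if CUDA=1 is requested
# but PyTorch sees no GPU, fall back to the CPU path instead of failing at
# the .cuda() calls below.
if CUDA and not torch.cuda.is_available(): CUDA = 0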
for dtype in [torch.float32, torch.float16]:
  for N in [256, 512, 1024, 2048, 4096]:
    # an NxN @ NxN matmul does N*N*N multiply-accumulates = 2*N^3 FLOPs
    FLOPS = N*N*N*2

    b = torch.rand((N,N), dtype=dtype)
    c = torch.rand((N,N), dtype=dtype)
    if CUDA: b,c = b.cuda(),c.cuda()

    def torch_prog(b, c):
      st = time.perf_counter()
      a = b@c
      # CUDA kernels launch asynchronously; synchronize so the timer measures
      # the matmul itself, not just the kernel launch
      if CUDA: torch.cuda.synchronize()
      return time.perf_counter() - st
    # best of 20 runs; the first run also absorbs any warmup cost
    tm = min([torch_prog(b, c) for _ in range(20)])
    print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS {N:4d}x{N:4d}x{N:4d} matmul in {dtype}")
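
# Usage (a sketch; assumes this file lives inside the tinygrad repo, e.g. as
# extra/gemm/torch_gemm.py, so that `from tinygrad.helpers import getenv`
# resolves):
#   python torch_gemm.py          # time matmuls on the GPU (default CUDA=1)
#   CUDA=0 python torch_gemm.py   # time single-threaded CPU matmuls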