"""Micro-benchmark: time square ``N x N`` matmuls in torch and print GFLOPS.

For each dtype and each matrix size the matmul is run 20 times and the
fastest wall-clock time is reported.
"""
import os

# These variables are exported *before* torch is imported, presumably so the
# numeric backends read them when they are first loaded. Keep this ordering.
os.environ["NVIDIA_TF32_OVERRIDE"] = "0"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import time

import torch

torch.set_num_threads(1)

from tinygrad.helpers import getenv

# Truthy -> tensors are moved to the GPU and timings are bracketed by device
# synchronization. Defaults to 1 via the second argument to getenv.
CUDA = getenv("CUDA", 1)


def torch_prog(b, c):
    """Return the wall-clock seconds taken by a single ``b @ c``.

    When ``CUDA`` is truthy the device is synchronized before the clock
    starts, so work queued earlier is not attributed to the matmul, and
    again before the clock stops, so the launched kernel has completed.
    """
    if CUDA:
        torch.cuda.synchronize()
    st = time.perf_counter()
    _ = b @ c  # result is discarded; only the elapsed time matters
    if CUDA:
        torch.cuda.synchronize()
    return time.perf_counter() - st


for dtype in (torch.float32, torch.float16):
    for N in (256, 512, 1024, 2048, 4096):
        # An N x N by N x N matmul performs N^3 multiply-adds = 2 * N^3 flops.
        FLOPS = N * N * N * 2

        b = torch.rand((N, N), dtype=dtype)
        c = torch.rand((N, N), dtype=dtype)
        if CUDA:
            b, c = b.cuda(), c.cuda()

        # Best of 20 runs: the minimum filters out warm-up and scheduling noise.
        tm = min(torch_prog(b, c) for _ in range(20))
        print(f"{N*N:10d} {tm*1e6:9.2f} us, would be {FLOPS*1e-9/tm:9.2f} GFLOPS {N:4d}x{N:4d}x{N:4d} matmul in {dtype}")