#!/usr/bin/env python3
# cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing, getenv
from tinygrad.device import Device

d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'

def sync():
  # wait for all queued work on both devices so each timing measures completed work
  Device[d0].synchronize()
  Device[d1].synchronize()

if __name__ == "__main__":
  print("GPU devices", d0, d1)
  sz = getenv("N", 1024*1024*256)  # 256M float32 elements = 1 GB per tensor

  with Timing("GPU initial sync: "): sync()

  # Timing hands elapsed nanoseconds to on_exit, so bytes/ns comes out directly as GB/sec
  with Timing("CPU creation: ", on_exit=lambda x: f", {(sz*4*2)/x:.2f} GB/sec"):
    c0 = (Tensor.ones(sz, device="clang")/2).realize()
    c1 = (Tensor.ones(sz, device="clang")/4).realize()
    print(c0.lazydata.base.realized)
    print(c1.lazydata.base.realized)

  # host -> device copies
  with Timing("CPU -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a0 = c0.to(d0).realize()
    sync()
  with Timing("CPU -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b1 = c1.to(d1).realize()
    sync()

  # cross copy. this is (sometimes) going through the CPU
  with Timing("0 -> 1: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    a1 = a0.to(d1).realize()
    sync()
  with Timing("1 -> 0: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    b0 = b1.to(d0).realize()
    sync()

  # sum on each device
  with Timing("0+0 -> 0 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab0 = (a0 + b0).realize()
    sync()
  with Timing("1+1 -> 1 (sum): ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    ab1 = (a1 + b1).realize()
    sync()

  # cross device sum (does this work?)
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx0 = (a0 + b1.to(d0)).realize()
    sync()
  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    abx1 = (b1 + a0.to(d1)).realize()
    sync()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=lambda x: f", {(sz*4)/x:.2f} GB/sec"):
    cc1 = ab1.numpy()

  # results from both devices must match
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # cross-device results must match too
  print("testing (cross)")
  np.testing.assert_allclose(cc0, abx0.numpy())
  np.testing.assert_allclose(cc0, abx1.numpy())

  # print the tensors to see which device each result landed on
  print(ab0)
  print(ab1)
  print(abx0)
  print(abx1)
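
  # A minimal sketch (not part of the original test) of checking placement by
  # machine rather than by eye. Assumption: Tensor.device reports the backing
  # device string; note tinygrad may canonicalize ":0" to the bare default
  # device name, so we only print rather than assert exact equality.
  for name, t in [("ab0", ab0), ("ab1", ab1), ("abx0", abx0), ("abx1", abx1)]:
    print(f"{name} realized on {t.device}")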