openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

79 lines
2.5 KiB

#!/usr/bin/env python3
# cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing, getenv
from tinygrad.device import Device
# Names of the two devices exercised by this script: index 0 and index 1 of the default backend.
d0, d1 = f'{Device.DEFAULT}:0', f'{Device.DEFAULT}:1'
def sync():
  """Call synchronize() on both benchmark devices, d0 first and then d1."""
  for dev_name in (d0, d1):
    Device[dev_name].synchronize()
if __name__ == "__main__":
  print("GPU devices", d0, d1)
  sz = getenv("N", 1024*1024*256)  # element count; the default is 1 GB at 4 bytes per element

  def gb_per_sec(n_bufs=1):
    # Build a Timing on_exit callback that appends a throughput figure for
    # n_bufs buffers of sz elements, counted at 4 bytes per element.
    # NOTE(review): the "GB/sec" label presumes Timing passes the elapsed time
    # in nanoseconds -- confirm against Timing's implementation.
    return lambda x: f", {(sz*4*n_bufs)/x:.2f} GB/sec"

  with Timing("GPU initial sync: "):
    sync()

  # create the two source buffers on the "clang" device
  with Timing("CPU creation: ", on_exit=gb_per_sec(2)):
    c0 = (Tensor.ones(sz, device="clang") / 2).realize()
    c1 = (Tensor.ones(sz, device="clang") / 4).realize()
    print(c0.lazydata.base.realized)
    print(c1.lazydata.base.realized)

  # host -> device uploads; sync() sits inside each timed region
  with Timing("CPU -> 0: ", on_exit=gb_per_sec()):
    a0 = c0.to(d0).realize()
    sync()
  with Timing("CPU -> 1: ", on_exit=gb_per_sec()):
    b1 = c1.to(d1).realize()
    sync()

  # cross copy. this is (sometimes) going through the CPU
  with Timing("0 -> 1: ", on_exit=gb_per_sec()):
    a1 = a0.to(d1).realize()
    sync()
  with Timing("1 -> 0: ", on_exit=gb_per_sec()):
    b0 = b1.to(d0).realize()
    sync()

  # sum, with both operands already on the same device
  with Timing("0+0 -> 0 (sum): ", on_exit=gb_per_sec()):
    ab0 = (a0 + b0).realize()
    sync()
  with Timing("1+1 -> 1 (sum): ", on_exit=gb_per_sec()):
    ab1 = (a1 + b1).realize()
    sync()

  # cross device sum (does this work?)
  with Timing(colored("0+1 -> 0 (sum): ", "red"), on_exit=gb_per_sec()):
    abx0 = (a0 + b1.to(d0)).realize()
    sync()
  with Timing(colored("1+0 -> 1 (sum): ", "red"), on_exit=gb_per_sec()):
    abx1 = (b1 + a0.to(d1)).realize()
    sync()

  # copy back
  # NOTE: half of this slowness is caused by allocating memory on the CPU
  with Timing("0 -> CPU: ", on_exit=gb_per_sec()):
    cc0 = ab0.numpy()
  with Timing("1 -> CPU: ", on_exit=gb_per_sec()):
    cc1 = ab1.numpy()

  # same: the sums computed on each device must agree
  print("testing")
  np.testing.assert_allclose(cc0, cc1)

  # same (cross): the cross-device sums must match the same-device result
  print("testing (cross)")
  for cross_sum in (abx0, abx1):
    np.testing.assert_allclose(cc0, cross_sum.numpy())

  # devices: print each result tensor
  for result in (ab0, ab1, abx0, abx1):
    print(result)