#!POPCORN leaderboard grayscale
#!POPCORN gpu A100
# NOTE: the tinygrad interop used below (e.g. the private `_from_torch_dtype` helper) is not a stable API, but it works.
import torch
from tinygrad import Tensor , TinyJit , Device
from tinygrad . helpers import Context , OSX
from tinygrad . dtype import _from_torch_dtype
@TinyJit
def f(tg_out, tg_data):
    """Write a weighted sum of the first three channels of ``tg_data`` into ``tg_out``.

    ``tg_data`` is indexed as ``[:, :, channel]``; channels 0, 1 and 2 are
    scaled by 0.2989, 0.5870 and 0.1140 respectively and added together.
    (Presumably these are R, G, B luma weights, given the "grayscale"
    leaderboard tag at the top of the file -- confirm the channel order
    against the caller.)

    Returns the result of ``tg_out.assign(...).realize()``.
    """
    chan0 = tg_data[:, :, 0]
    chan1 = tg_data[:, :, 1]
    chan2 = tg_data[:, :, 2]
    weighted = chan0 * 0.2989 + chan1 * 0.5870 + chan2 * 0.1140
    # assign() targets tg_out's existing buffer; realize() forces execution.
    assigned = tg_out.assign(weighted)
    return assigned.realize()
def custom_kernel(data: torch.Tensor, device="CUDA") -> torch.Tensor:
    """Run the jitted tinygrad kernel ``f`` on a torch tensor and return a torch tensor.

    The input and a freshly allocated output are both handed to tinygrad via
    ``Tensor.from_blob`` on their raw data pointers; ``f`` is then invoked on
    those wrappers and the torch output tensor is returned.

    Args:
        data: float32 tensor that ``f`` indexes as ``[:, :, channel]``
            (channels 0-2 are read). A non-contiguous tensor is compacted
            with ``.contiguous()`` before its pointer is taken.
        device: tinygrad device name used for both ``from_blob`` wrappers and
            for the final synchronize. It should name the backend that owns
            ``data``'s memory, since a raw pointer is passed to it
            (the script entry point pairs "cuda" with "CUDA" and "mps" with "METAL").

    Returns:
        A tensor of shape ``(data.shape[0], data.shape[1])`` with the same
        dtype and torch device as ``data``.

    Raises:
        AssertionError: if ``data`` is not float32.
    """
    assert data.dtype == torch.float32, f"expected a float32 tensor, got {data.dtype}"

    # from_blob is given only a pointer and a shape (no strides), so a strided
    # view (e.g. the result of permute/transpose or a stepped slice) would be
    # read with the wrong layout. contiguous() returns the tensor itself when
    # it is already contiguous, so the common path is unchanged.
    data = data.contiguous()

    tg_data = Tensor.from_blob(
        data.data_ptr(), data.shape, dtype=_from_torch_dtype(data.dtype), device=device
    )
    out = torch.empty((data.shape[0], data.shape[1]), dtype=data.dtype, device=data.device)
    tg_out = Tensor.from_blob(
        out.data_ptr(), out.shape, dtype=_from_torch_dtype(out.dtype), device=device
    )

    # Sync torch first so that the input data is valid before tinygrad reads it.
    if data.device.type == "mps":
        torch.mps.synchronize()
    else:
        torch.cuda.synchronize()

    with Context(BEAM=2):
        f(tg_out, tg_data)

    # Wait for the tinygrad computation to finish so that `out` holds valid data.
    Device[device].synchronize()
    return out
if __name__ == "__main__":
    # Smoke test: compare custom_kernel against the same weighted sum computed in torch.
    # Choose the torch device and the matching tinygrad backend name once.
    torch_device_name, tg_device_name = ("mps", "METAL") if OSX else ("cuda", "CUDA")

    # Several iterations with fresh random input; presumably this exercises the
    # TinyJit-wrapped kernel beyond its first call -- confirm against TinyJit docs.
    for _ in range(3):
        inp = torch.rand(16, 16, 3, device=torch.device(torch_device_name))
        out = custom_kernel(inp, device=tg_device_name)
        expected = inp[:, :, 0] * 0.2989 + inp[:, :, 1] * 0.5870 + inp[:, :, 2] * 0.1140
        assert torch.allclose(out, expected)