import os, pathlib, argparse
from examples.llama3 import Tokenizer
from tabulate import tabulate
from tinygrad import fetch
from tinygrad.helpers import flatten
# llama 3 tokenizer
tokenizer = Tokenizer(fetch("https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model").as_posix())
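
# walk the tinygrad package and collect (header, source) pairs for every
# Python file, skipping the generated bindings under tinygrad/runtime/autogen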
def read_code(base_path):
  ret = []
  for path, _, files in os.walk(os.path.join(base_path, "tinygrad")):
    for name in files:
      if not name.endswith(".py"): continue
      if 'tinygrad/runtime/autogen' in path.replace('\\', '/'): continue
      fullpath = os.path.join(path, name)
      code = pathlib.Path(fullpath).read_text()
      ret.append(("### "+fullpath.split("tinygrad/", 1)[1], code))
  return ret
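
# flatten turns [(header, code), ...] into [header, code, header, code, ...],
# so each file's "### path" header line immediately precedes its source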
def write_code_to_file(filename, code_list):
  """Writes the combined code to a specified file."""
  with open(filename, 'w') as f:
    f.write('\n'.join(flatten(code_list)))
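
# CLI entry point: report per-file and total LLM token counts, and
# optionally dump the combined source to a file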
if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Analyze and optionally save tinygrad code.")
  parser.add_argument("--output", help="Output file to write the combined code to.")
  args = parser.parse_args()
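
  # tokenize each file (header + NUL separator + contents) and print a table
  # of per-file token counts, largest first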
  ret = read_code(".")
  table = []
  for name, code in ret:
    table.append([name, len(tokenizer.encode(name+"\x00"+code))])
  print(tabulate([["name", "llm tokens"]]+sorted(table, key=lambda x: -x[1]), headers="firstrow"))
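
  # join everything with NUL separators and report the size of the whole
  # codebase in characters, newlines, and tokens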
  code_str = '\x00'.join(flatten(ret))
  print(f"code has {len(code_str)} chars")
  newline_count = code_str.count('\n')
  print(f"code has {newline_count} newlines")
  encoded = tokenizer.encode(code_str)
  print(f"code has {len(encoded)} tokens")
  if args.output:
    write_code_to_file(args.output, ret)
    print(f"Combined code written to {args.output}")