You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
36 lines
1.2 KiB
36 lines
1.2 KiB
1 month ago
|
import os, pathlib
|
||
|
from examples.llama3 import Tokenizer
|
||
|
from tabulate import tabulate
|
||
|
from tinygrad import fetch
|
||
|
from tinygrad.helpers import flatten
|
||
|
|
||
|
# Llama 3 tokenizer, loaded from the official tokenizer.model weights.
# fetch() downloads the file once and caches it locally; as_posix() hands
# the Tokenizer a plain string path.
_TOKENIZER_URL = "https://huggingface.co/bofenghuang/Meta-Llama-3-8B/resolve/main/original/tokenizer.model"
tokenizer = Tokenizer(fetch(_TOKENIZER_URL).as_posix())
|
def read_code(base_path):
  """Collect every Python source file under ``base_path/tinygrad``.

  Walks the tree rooted at ``os.path.join(base_path, "tinygrad")``, keeping
  only ``.py`` files and skipping the machine-generated
  ``tinygrad/runtime/autogen`` subtree.

  Args:
    base_path: directory containing the ``tinygrad`` package checkout.

  Returns:
    list of ``(relative_name, source_text)`` tuples, where ``relative_name``
    is the file path relative to the ``tinygrad/`` directory, using forward
    slashes.
  """
  ret = []
  for path, _, files in os.walk(os.path.join(base_path, "tinygrad")):
    for name in files:
      if not name.endswith(".py"): continue
      # Normalize separators once so both the autogen filter and the
      # relative-name split below work on Windows, where os.path.join
      # produces backslashes (the original split raised IndexError there).
      fullpath = os.path.join(path, name).replace('\\', '/')
      if 'tinygrad/runtime/autogen' in fullpath: continue
      code = pathlib.Path(fullpath).read_text()
      # NOTE(review): assumes base_path itself does not already contain a
      # "tinygrad/" component, otherwise the split lands too early.
      ret += [(fullpath.split("tinygrad/", 1)[1], code)]
  return ret
|
if __name__ == "__main__":
  # Gather every tinygrad source file, then report per-file and total
  # LLM token counts.
  files = read_code(".")

  # One row per file: [name, token count of name + NUL + source], biggest first.
  rows = [[fname, len(tokenizer.encode(fname+"\x00"+src))] for fname, src in files]
  rows.sort(key=lambda row: row[1], reverse=True)
  print(tabulate([["name", "llm tokens"]]+rows, headers="firstrow"))

  # Whole-repo stats over the NUL-joined stream of names and sources.
  code_str = '\x00'.join(flatten(files))
  print(f"code has {len(code_str)} chars")
  newline_count = code_str.count('\n')
  print(f"code has {newline_count} newlines")

  encoded = tokenizer.encode(code_str)
  print(f"code has {len(encoded)} tokens")