#!/usr/bin/env python3
import os, math, time
import numpy as np
from tinygrad import Tensor, nn, fetch, Device, TinyJit, GlobalCounters
from dataclasses import dataclass
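
# defaults match GPT-2 small (124M params); the vocab is padded from 50257 to
# 50304 (a multiple of 64) so the output projection has kernel-friendly sizes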
@dataclass
class GPTConfig:
  block_size: int = 1024
  vocab_size: int = 50257
  padded_vocab_size: int = 50304
  n_layer: int = 12
  n_head: int = 12
  n_embd: int = 768
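
# multi-head causal self-attention: softmax(q @ k.T / sqrt(head_dim)) @ v, with
# a lower-triangular mask so position t can only attend to positions <= t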
class CausalSelfAttention:
  def __init__(self, config:GPTConfig):
    assert config.n_embd % config.n_head == 0
    # key, query, value projections for all heads, but in a batch
    self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
    # output projection
    self.c_proj = nn.Linear(config.n_embd, config.n_embd)
    # regularization
    self.n_head = config.n_head
    self.n_embd = config.n_embd
    # not really a 'bias', more of a mask, but following the OpenAI/HF naming
    self.bias = Tensor.ones(1, 1, config.block_size, config.block_size).tril()
    self.bias.requires_grad = False
  def __call__(self, x:Tensor):
    B, T, C = x.shape
    qkv = self.c_attn(x)
    q, k, v = qkv.split(self.n_embd, dim=2)
    k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
    # manual implementation of attention
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
    att = att.softmax()
    y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = y.transpose(1, 2).view(B, T, C) # re-assemble all head outputs side by side
    # output projection
    y = self.c_proj(y)
    return y
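
# position-wise feed-forward network: expand 4x, GELU, project back down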
class MLP:
  def __init__(self, config:GPTConfig):
    self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd)
    self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd)
  def __call__(self, x:Tensor) -> Tensor:
    return self.c_proj(self.c_fc(x).gelu())
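
# transformer block: pre-LayerNorm residual connections (GPT-2 style)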
class Block:
  def __init__(self, config:GPTConfig):
    self.ln_1 = nn.LayerNorm(config.n_embd)
    self.attn = CausalSelfAttention(config)
    self.ln_2 = nn.LayerNorm(config.n_embd)
    self.mlp = MLP(config)
  def __call__(self, x:Tensor):
    x = x + self.attn(self.ln_1(x))
    x = x + self.mlp(self.ln_2(x))
    return x
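
# the full model: token + position embeddings, a stack of blocks, a final
# LayerNorm, and a weight-tied language-model head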
class GPT:
  def __init__(self, config:GPTConfig):
    self.config = config
    self.wte = nn.Embedding(config.padded_vocab_size, config.n_embd)
    self.wpe = nn.Embedding(config.block_size, config.n_embd)
    self.h = [Block(config) for _ in range(config.n_layer)]
    self.ln_f = nn.LayerNorm(config.n_embd)
    self.lm_head = nn.Linear(config.n_embd, config.padded_vocab_size, bias=False)
    self.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
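  # load the HuggingFace GPT-2 (124M) checkpoint. HF stores Conv1D-style
  # (in, out) matrices, so the listed weights are transposed to nn.Linear's
  # (out, in) layout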
  def load_pretrained(self):
    weights = nn.state.torch_load(fetch('https://huggingface.co/gpt2/resolve/main/pytorch_model.bin'))
    transposed = ('attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight')
    for k in weights:
      if k == "wte.weight":
        # pad the embedding rows out to the padded vocab size
        weights[k] = weights[k].pad(((0, self.config.padded_vocab_size - self.config.vocab_size), (0, 0))).to(None).contiguous()
      if k.endswith(transposed):
        weights[k] = weights[k].to(None).T.contiguous()
    # lm head and wte are tied
    weights['lm_head.weight'] = weights['wte.weight']
    nn.state.load_state_dict(self, weights)
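  # autoregressive sampling: crop the context to block_size, scale the
  # last-position logits by temperature, sample from the softmax
  # (top_k is accepted for signature parity but currently unused)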
  def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
    for _ in range(max_new_tokens):
      idx_cond = idx if idx.shape[1] <= self.config.block_size else idx[:, -self.config.block_size:]
      logits, _ = self(idx_cond)
      logits = logits[:, -1, :] / temperature
      idx_next = logits.softmax().multinomial()
      idx = Tensor.cat(idx, idx_next, dim=1)
    return idx
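  # forward pass; returns (logits, loss) when targets are given, otherwise the
  # logits for just the last position (all generate() needs) and loss=None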
  def __call__(self, idx:Tensor, targets=None):
    b, t = idx.shape
    pos = Tensor.arange(0, t, device=idx.device)
    tok_emb = self.wte(idx) # token embeddings of shape (b, t, n_embd)
    pos_emb = self.wpe(pos) # position embeddings of shape (t, n_embd)
    x = tok_emb + pos_emb
    x = self.ln_f(x.sequential(self.h))
    if targets is not None:
      # slice off the padded vocab entries before computing the loss
      logits = self.lm_head(x)[:, :, :self.config.vocab_size]
      loss = logits.sparse_categorical_crossentropy(targets)
    else:
      logits = self.lm_head(x[:, [-1], :])[:, :, :self.config.vocab_size]
      loss = None
    return logits, loss
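
# benchmark/overfit harness: repeatedly train on a single fixed batch of
# tiny_shakespeare tokens, then (optionally) sample from the model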
if __name__ == "__main__":
  import tiktoken, argparse
  parser = argparse.ArgumentParser()
  parser.add_argument("--num_iterations", type=int, default=10, help="number of iterations to run")
  parser.add_argument("--batch_size", type=int, default=4, help="batch size")
  parser.add_argument("--sequence_length", type=int, default=64, help="sequence length")
  parser.add_argument("--skip_test", action="store_true", help="skip test")
  parser.add_argument("--gpus", type=int, default=1, help="number of gpus")
  args = parser.parse_args()
  B, T = args.batch_size, args.sequence_length
  assert 1 <= T <= 1024
  model = GPT(GPTConfig(n_layer=12, n_head=12, n_embd=768))
  model.load_pretrained()
  if args.gpus > 1:
    GPUS = tuple(f'{Device.DEFAULT}:{i}' for i in range(args.gpus))
    for x in nn.state.get_parameters(model): x.to_(GPUS) # we put a copy of the model on every GPU

  # init the tokenizer
  enc = tiktoken.get_encoding("gpt2")
  encode = lambda s: enc.encode(s, allowed_special={"<|endoftext|>"})
  decode = lambda l: enc.decode(l)

  # load the tokens
  # we're using the tiny_shakespeare val split just because it is smaller/faster
  tokens_bin = fetch("https://huggingface.co/datasets/karpathy/llmc-starter-pack/resolve/main/tiny_shakespeare_val.bin")
  assert os.path.isfile(tokens_bin)
  print(f"loading cached tokens in {tokens_bin}")
  with open(tokens_bin, "rb") as f:
    f.seek(0x400) # skip the 1024-byte header of the llm.c .bin format
    tokens = np.frombuffer(f.read(), dtype=np.uint16).astype(np.int32)
  tokens = Tensor(tokens)
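
  # each batch is a contiguous, non-overlapping window of B*T tokens; targets y
  # are x shifted by one position, so the model learns next-token prediction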
  # lightweight dataloader
  def get_batch():
    assert B*T+1 <= len(tokens), "not enough tokens"
    # for 338,025 tokens. E.g. with B=8 T=1024, this will yield 41 batches before looping
    i = 0
    while True:
      x = tokens[i:i+B*T].view(B, T)
      y = tokens[i+1:i+B*T+1].view(B, T)
      yield x, y
      i += B*T
      if i + B*T + 1 >= len(tokens):
        i = 0 # in prod we'd want to randomize the start point a bit

  # forward backward for a few iterations
  data_iter = iter(get_batch())
  x, y = next(data_iter) # we'll overfit this batch below
  optimizer = nn.optim.AdamW(nn.state.get_parameters(model), lr=1e-4, weight_decay=0)
  print(f"model state: {sum(x.nbytes() for x in nn.state.get_parameters(model))/1e9:.2f} GB")
  print(f"optimizer state: {sum(x.nbytes() for x in nn.state.get_parameters(optimizer))/1e9:.2f} GB")
  # shard the data on axis 0 (the batch dimension) across the GPUs
  if args.gpus > 1: x, y = x.shard(GPUS, axis=0), y.shard(GPUS, axis=0)
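
  # TinyJit captures the kernel schedule during the first couple of calls and
  # replays it afterwards, skipping per-step Python and scheduling overhead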
  @TinyJit
  @Tensor.train()
  def step(x:Tensor, y:Tensor) -> Tensor:
    _, loss = model(x, y)
    optimizer.zero_grad()
    loss.backward()
    return loss.realize(*optimizer.schedule_step())
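
  # timed training loop; GlobalCounters is reset each step so global_mem
  # reports the memory moved by that step's kernels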
  for i in range(args.num_iterations):
    GlobalCounters.reset()
    t0 = time.perf_counter()
    loss = step(x.contiguous(), y.contiguous())
    Device[Device.DEFAULT].synchronize()
    t1 = time.perf_counter()
    print(f"iteration {i}, loss: {loss.item():.6f}, time: {(t1-t0)*1000:.3f}ms, {int(B*T/(t1-t0))} tok/s, {GlobalCounters.global_mem/1e9:.2f} GB")

  if not args.skip_test:
    # copy back to single gpu for test
    if args.gpus > 1:
      for x in nn.state.get_parameters(model): x.to_(Device.DEFAULT)
    start = "<|endoftext|>"
    start_ids = encode(start)
    x = Tensor(start_ids)[None, ...]
    max_new_tokens = 16
    temperature = 1.0
    top_k = 40
    y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
    print(decode(y[0].tolist()))