#!/usr/bin/env python3
import os, time, io, pathlib, sys, traceback
sys.path.insert(0, str(pathlib.Path(__file__).parent.parent))

# default to the GPU (OpenCL) backend with image support and max optimization
if os.getenv("OPT", None) is None:
  os.environ['OPT'] = '99'
if os.getenv("GPU", None) is None:
  os.environ['GPU'] = '1'
if os.getenv("IMAGE", None) is None:
  os.environ['IMAGE'] = '2'

from tinygrad.helpers import getenv
ALLOWED_KERNEL_COUNT = getenv("ALLOWED_KERNEL_COUNT", 0)
DEBUGCL = getenv("DEBUGCL", 0)

import onnx
import numpy as np
import tinygrad.graph as graph
from tinygrad.ops import GlobalCounters

import pyopencl as cl
from tinygrad.runtime.ops_gpu import CL
from extra.utils import fetch
from extra.onnx import get_run_onnx
from tinygrad.tensor import Tensor

OPENPILOT_MODEL = "https://github.com/commaai/openpilot/raw/6c5693e965b9c63f8678f52b9e9b5abe35f23feb/selfdrive/modeld/models/supercombo.onnx"

np.random.seed(1337)
def get_random_input_tensors(input_shapes):
  # this 8 is a random scale factor
  inputs = {k:Tensor.randn(*shp, requires_grad=False)*8 for k,shp in input_shapes.items()}
  np_inputs = {k:v.realize().numpy() for k,v in inputs.items()}
  return inputs, np_inputs

from tinygrad.jit import TinyJit
@TinyJit
def model_exec(run_onnx, using_graph, **inputs):
  ret = next(iter(run_onnx(inputs).values()))
  GlobalCounters.reset()
  GlobalCounters.cache = []  # don't cache pre-realize
  if using_graph: graph.GRAPH = True
  print("realizing")
  return ret.realize()

def compile(dat, output_fn):
  Tensor.manual_seed(1337)
  Tensor.no_grad = True

  # keep the graph off while tracing, re-enable it only for the JIT run
  using_graph = graph.GRAPH
  graph.GRAPH = False

  onnx_model = onnx.load(io.BytesIO(dat))
  run_onnx = get_run_onnx(onnx_model)
  input_shapes = {inp.name:tuple(x.dim_value for x in inp.type.tensor_type.shape.dim) for inp in onnx_model.graph.input}

  inputs, np_inputs = get_random_input_tensors(input_shapes)

  # run twice to trigger the JIT
  for i in range(2):
    tinygrad_out = model_exec(run_onnx, i == 1 and using_graph, **inputs)
  graph.GRAPH = False

  print("kernel count:", len(model_exec.jit_cache))
  assert len(model_exec.jit_cache) <= ALLOWED_KERNEL_COUNT or ALLOWED_KERNEL_COUNT == 0, "too many kernels!"
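  # TinyJit records in input_replace which (kernel, argument) slots of the
  # jit_cache hold the named input tensors; the realized input buffers are
  # patched into those slots below so thneed captures the real inputs.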
  # pull out inputs and put them in the jit cache
  input_rawbuffers = {k:inputs[k].lazydata.realized.raw() for k in inputs.keys()}
  for (j,i),idx in model_exec.input_replace.items():
    model_exec.jit_cache[j][1][i] = input_rawbuffers[idx]

  # transform to CL.CACHE
  used_ops = 0
  cl_cache = []
  for prg,args in model_exec.jit_cache:
    # pass these to thneed
    setattr(prg.clprg, 'op_estimate', prg.op_estimate)
    setattr(prg.clprg, 'prg', prg.prg)
    cl_cache.append((prg.clprg, [prg.global_size, prg.local_size, *[x._cl for x in args]]))
    used_ops += prg.op_estimate

  from extra.thneed import Thneed
  t = Thneed(cl_cache, {k:v._cl for k,v in input_rawbuffers.items()})

  # save thneed (before run)
  t.save(output_fn)

  print(f"buffers to save: {len(t.buffers_to_save)}, inputs: {list(t.inputs.keys())}, outputs: {t.outputs}")
  runtime = t.run()
  print(f"network using {used_ops/1e9:.2f} GOPS with runtime {runtime*1e3:.2f} ms that's {used_ops/runtime*1e-9:.2f} GFLOPS")

  # confirm thneed found the right output
  thneed_out = np.empty((t.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape)
  cl.enqueue_copy(CL.cl_queue, thneed_out, t.outputs[0], is_blocking=True)
  np.testing.assert_allclose(thneed_out, tinygrad_out.numpy())

  # testing is float32 only (fix this)
  FLOAT16 = getenv("FLOAT16", 0)
  if FLOAT16 == 0:
    try:
      from test.models.test_onnx import run_onnx_torch
      torch_out = run_onnx_torch(onnx_model, np_inputs).numpy()
      print(thneed_out, torch_out, "mse", np.sum((thneed_out-torch_out)**2), "max err", np.max(np.abs(thneed_out-torch_out)))
      np.testing.assert_allclose(torch_out, thneed_out, atol=1e-4, rtol=1e-2)

      # test loading/run thneed
      _, new_np_inputs = get_random_input_tensors(input_shapes)
      new_torch_out = run_onnx_torch(onnx_model, new_np_inputs).numpy()

      # try old thneed with a different input
      for k,v in t.inputs.items():
        cl.enqueue_copy(CL.cl_queue, v, new_np_inputs[k], is_blocking=True)
      t.run()
      old_thneed_out = np.empty((t.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape)
      cl.enqueue_copy(CL.cl_queue, old_thneed_out, t.outputs[0], is_blocking=True)

      # compare thneed (rerun) with torch
      np.testing.assert_allclose(new_torch_out, old_thneed_out, atol=1e-4, rtol=1e-2)

      # load thneed and try that
      _, new_np_inputs = get_random_input_tensors(input_shapes)
      new_torch_out = run_onnx_torch(onnx_model, new_np_inputs).numpy()
      nt = Thneed()
      nt.load(output_fn)

      # inputs
      for k,v in nt.inputs.items():
        cl.enqueue_copy(CL.cl_queue, v, new_np_inputs[k], is_blocking=True)

      nt.run()
      new_thneed_out = np.empty((nt.outputs[0].size//4,), dtype=np.float32).reshape(tinygrad_out.shape)
      cl.enqueue_copy(CL.cl_queue, new_thneed_out, nt.outputs[0], is_blocking=True)

      # compare torch to thneed
      np.testing.assert_allclose(new_torch_out, new_thneed_out, atol=1e-4, rtol=1e-2)
      print("thneed self-test passed!")
    except ModuleNotFoundError as e:
      print(f"TEST NOT HAPPENING {e}")

# UNSAFE_FLOAT4=1 DEBUGCL=1 FLOAT16=1 python3 openpilot/compile.py
# 22.59 ms
if __name__ == "__main__":
  if len(sys.argv) >= 3:
    with open(sys.argv[1], "rb") as f:
      dat = f.read()
    compile(dat, sys.argv[2])
  else:
    dat = fetch(OPENPILOT_MODEL)
    compile(dat, "/tmp/output.thneed")
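
# A minimal sketch of reusing a saved artifact, with the same extra.thneed API
# exercised in the self-test above (illustrative only, not run by this script):
#
#   from extra.thneed import Thneed
#   nt = Thneed()
#   nt.load("/tmp/output.thneed")
#   # fill nt.inputs via cl.enqueue_copy as above, then run the cached kernels
#   nt.run()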