import subprocess
import numpy as np
import torch
import unittest, copy, mmap, random, math, array
from tinygrad import Tensor, Device, dtypes
from tinygrad.helpers import getenv, temp, _METADATA, mv_address
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from hypothesis import given, settings, strategies as strat
from tinygrad.device import is_dtype_supported
from tinygrad.ops import Ops, UOp
from tinygrad.runtime.support.compiler_cuda import PTX
from tinygrad.codegen.linearize import linearize_uop
from tinygrad.codegen.devectorizer import full_graph_rewrite
from tinygrad.codegen.lowerer import rewrite_shapetracker_with_index
from tinygrad.dtype import DType

# Hypothesis profile loaded for this module: 200 examples per property, no per-example deadline,
# and derandomization taken from getenv("DERANDOMIZE_CI") (False when unset).
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
settings.load_profile("my_profile")

# Shared float32 fixtures, sampled once at import time (no seed is set here) and reused by the
# tinygrad-vs-torch comparison tests below.
x_init = np.random.randn(1,3).astype(np.float32)
U_init = np.random.randn(3,3).astype(np.float32)
V_init = np.random.randn(3,3).astype(np.float32)
W_init = np.random.randn(3,3).astype(np.float32)
m_init = np.random.randn(1,3).astype(np.float32)
gradient = np.random.randn(1,3).astype(np.float32)

class TestTinygrad(unittest.TestCase):
  def test_zerodim_initialization(self):
    # a python scalar (int or float) must produce a 0-d tensor
    for scalar in (55, 3.14):
      self.assertEqual(Tensor(scalar).shape, ())

  def test_plus_equals(self):
    # in-place `+=` must produce the same values as the out-of-place sum
    lhs = Tensor.randn(10,10)
    rhs = Tensor.randn(10,10)
    summed = lhs + rhs
    expected = summed.numpy()
    lhs += rhs
    np.testing.assert_allclose(expected, lhs.numpy())

  def test_backward_pass(self):
    # the same small network (dot -> relu -> log_softmax -> mul/add -> sum) is run in both frameworks;
    # the loss and the gradients w.r.t. x and W must agree
    def tinygrad_results():
      x = Tensor(x_init, requires_grad=True)
      W = Tensor(W_init, requires_grad=True)
      m = Tensor(m_init)
      loss = x.dot(W).relu().log_softmax().mul(m).add(m).sum()
      loss.backward()
      return loss.numpy(), x.grad.numpy(), W.grad.numpy()

    def torch_results():
      x = torch.tensor(x_init, requires_grad=True)
      W = torch.tensor(W_init, requires_grad=True)
      m = torch.tensor(m_init)
      log_probs = torch.nn.functional.log_softmax(x.matmul(W).relu(), dim=1)
      loss = log_probs.mul(m).add(m).sum()
      loss.backward()
      return loss.detach().numpy(), x.grad, W.grad

    for got, want in zip(tinygrad_results(), torch_results()):
      np.testing.assert_allclose(got, want, atol=1e-5)

  # Check that gradients accumulate when backward is run more than once on the same output
  def test_accumulate_gradients(self):
    x = Tensor(x_init, requires_grad=True)
    W = Tensor(W_init, requires_grad=True)
    m = Tensor(m_init)
    out = x.dot(W).relu().log_softmax().mul(m).add(m).sum()

    # run backward three times, remembering the .grad tensors seen after each pass
    seen = []
    for _ in range(3):
      out.backward()
      seen.append((x.grad, W.grad))
    (xgrad, wgrad), (xgrad2, wgrad2), (xgrad3, wgrad3) = seen

    # after k passes the accumulated gradient must be k times the single-pass gradient
    checks = [(xgrad3, xgrad, 3.), (wgrad3, wgrad, 3.), (xgrad2, xgrad, 2.), (wgrad2, wgrad, 2.)]
    for accumulated, single, factor in checks:
      np.testing.assert_allclose(accumulated.numpy(), single.numpy() * factor, atol=1e-6)

  def test_second_order_backward_pass(self):
    # f(x) = x**3, so d^2f/dx^2 = 6x, which is 12 at x = 2; both frameworks must agree on it
    def torch_second_derivative():
      x = torch.tensor([2.0], requires_grad=True)
      cubed = x**3
      # create_graph=True keeps the first derivative differentiable
      d1 = torch.autograd.grad(outputs=cubed, inputs=x, create_graph=True)[0]
      d2 = torch.autograd.grad(outputs=d1, inputs=x)[0]
      return d2.numpy()

    def tinygrad_second_derivative():
      x = Tensor(2.0)
      cubed = x**3
      d1 = cubed.gradient(x)[0]
      d2 = d1.gradient(x)[0]
      return d2.numpy()

    np.testing.assert_allclose(tinygrad_second_derivative(), torch_second_derivative(), atol=1e-5)

  # passing `gradient` to backward
  def test_backward_pass_vjp(self):
    # the output is left un-reduced and an explicit upstream gradient is handed to backward()
    def tinygrad_results():
      x = Tensor(x_init, requires_grad=True)
      W = Tensor(W_init, requires_grad=True)
      m = Tensor(m_init)
      result = x.dot(W).relu().log_softmax().mul(m).add(m)
      result.backward(Tensor(gradient))
      return result.numpy(), x.grad.numpy(), W.grad.numpy()

    def torch_results():
      x = torch.tensor(x_init, requires_grad=True)
      W = torch.tensor(W_init, requires_grad=True)
      m = torch.tensor(m_init)
      log_probs = torch.nn.functional.log_softmax(x.matmul(W).relu(), dim=1)
      result = log_probs.mul(m).add(m)
      result.backward(torch.tensor(gradient))
      return result.detach().numpy(), x.grad, W.grad

    for got, want in zip(tinygrad_results(), torch_results()):
      np.testing.assert_allclose(got, want, atol=1e-5)

  def test_backward_pass_diamond_model(self):
    # "diamond" graph: u feeds two branches (u*v and u*w) that are merged again before the loss
    def tinygrad_results():
      u = Tensor(U_init, requires_grad=True)
      v = Tensor(V_init, requires_grad=True)
      w = Tensor(W_init, requires_grad=True)
      left = u.mul(v).relu()
      right = u.mul(w).relu()
      loss = left.add(right).mul(right).relu().log_softmax().sum()
      loss.backward()
      return loss.numpy(), u.grad.numpy(), v.grad.numpy(), w.grad.numpy()

    def torch_results():
      u = torch.tensor(U_init, requires_grad=True)
      v = torch.tensor(V_init, requires_grad=True)
      w = torch.tensor(W_init, requires_grad=True)
      left = u.mul(v).relu()
      right = u.mul(w).relu()
      merged = left.add(right).mul(right).relu()
      loss = torch.nn.functional.log_softmax(merged, dim=1).sum()
      loss.backward()
      return loss.detach().numpy(), u.grad, v.grad, w.grad

    for got, want in zip(tinygrad_results(), torch_results()):
      np.testing.assert_allclose(got, want, atol=1e-5, rtol=1e-6)

  @unittest.expectedFailure
  def test_const_backward_pass(self):
    # gradients of the sum of two scalar tensors built from a python float; marked expectedFailure
    init = 3.5

    def torch_grads():
      first = torch.tensor(init, requires_grad=True)
      second = torch.tensor(init, requires_grad=True)
      first.add(second).backward()
      return first.grad, second.grad

    def tinygrad_grads():
      first = Tensor(init, requires_grad=True)
      second = Tensor(init, requires_grad=True)
      first.add(second).backward()
      return first.grad.numpy(), second.grad.numpy()

    for got, want in zip(tinygrad_grads(), torch_grads()):
      np.testing.assert_allclose(got, want, atol=1e-5)

  def test_nograd(self):
    # only W requires grad; x, m and their product must stay without a .grad after backward
    x = Tensor(x_init, requires_grad=False)
    m = Tensor(m_init, requires_grad=False)
    W = Tensor(W_init, requires_grad=True)
    scaled = x.mul(m)
    projected = scaled.matmul(W)
    loss = projected.relu().sum()
    loss.backward()
    for untouched in (x, m, scaled):
      assert untouched.grad is None
    for tracked in (projected, W):
      assert tracked.grad is not None

  def test_dropout(self):
    # in training mode the fraction of surviving elements must be close to (1 - rate)
    with Tensor.train():
      total, drop_rate = 1_000_000, 0.1
      dropped = Tensor.ones(total).dropout(drop_rate)
      survivors = np.count_nonzero(dropped.numpy())
      np.testing.assert_allclose(survivors, total * (1 - drop_rate), rtol=2e-3)

  def test_jacobian(self):
    # fixed seeds keep the inputs reproducible
    weights = np.random.RandomState(42069).random((10, 5)).astype(np.float32)
    inputs = np.random.RandomState(69420).random((1, 10)).astype(np.float32)

    # reference jacobian from torch
    torch_x = torch.tensor(inputs, requires_grad=True)
    torch_W = torch.tensor(weights, requires_grad=True)
    def torch_model(inp): return torch.nn.functional.log_softmax(inp.matmul(torch_W).relu(), dim=1)
    reference = torch.autograd.functional.jacobian(torch_model, torch_x).squeeze().numpy()

    # analytic and numerical jacobians from the gradcheck helpers
    tiny_x = Tensor(inputs, requires_grad=True)
    tiny_W = Tensor(weights, requires_grad=True)
    def tiny_model(inp): return inp.dot(tiny_W).relu().log_softmax()
    analytic = jacobian(tiny_model, tiny_x)
    numeric = numerical_jacobian(tiny_model, tiny_x)

    # the numerical estimate is only compared with a looser tolerance
    np.testing.assert_allclose(reference, analytic, atol = 1e-5)
    np.testing.assert_allclose(reference, numeric, atol = 1e-3)

  def test_gradcheck(self):
    weights = np.random.RandomState(1337).random((10, 5)).astype(np.float32)
    inputs = np.random.RandomState(7331).random((1, 10)).astype(np.float32)

    tiny_x = Tensor(inputs, requires_grad=True)
    tiny_W = Tensor(weights, requires_grad=True)
    def model(inp): return inp.dot(tiny_W).relu().log_softmax()

    # with eps=1e-3 the numerical and analytic gradients are expected to agree
    agrees = gradcheck(model, tiny_x, eps = 1e-3)
    self.assertTrue(agrees)

    # with eps=1e-5 the check is expected to fail
    # NOTE(review): presumably float32 round-off dominates the finite difference at this step size - confirm
    disagrees = gradcheck(model, tiny_x, eps = 1e-5)
    self.assertFalse(disagrees)

  def test_random_fns_are_deterministic_with_seed(self):
    # re-seeding with the same value before each draw must reproduce the draw exactly
    for random_fn in [Tensor.randn, Tensor.normal, Tensor.uniform, Tensor.scaled_uniform, Tensor.glorot_uniform, Tensor.kaiming_normal]:
      with self.subTest(msg=f"Tensor.{random_fn.__name__}"):
        draws = []
        for _ in range(2):
          Tensor.manual_seed(1337)
          draws.append(random_fn(10,10).realize())
        first, second = draws
        np.testing.assert_allclose(first.numpy(), second.numpy())

  def test_randn_isnt_inf_on_zero(self):
    from unittest import mock
    # simulate failure case of rand handing a zero to randn: Tensor.rand is swapped for Tensor.zeros
    # while randn runs. patch.object puts the original class attribute back on exit, also when the
    # assertion fails, so no hand-written try/finally (or bare `except: raise`) is needed. It restores the
    # attribute exactly as it was stored on the class, which matters if rand is declared as a staticmethod.
    with mock.patch.object(Tensor, "rand", Tensor.zeros):
      self.assertNotIn(np.inf, Tensor.randn(16).numpy())

  def test_zeros_like_has_same_dtype_and_shape(self):
    # without a dtype argument, zeros_like must mirror the source tensor's dtype and shape
    for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
      a = Tensor([1, 2, 3], dtype=datatype)
      b = Tensor.zeros_like(a)
      assert a.dtype == b.dtype, f"dtype mismatch {a.dtype=} != {b.dtype}"
      assert a.shape == b.shape, f"shape mismatch {a.shape} != {b.shape}"

    # an explicit dtype overrides the source dtype (the default int here); the shape is still copied
    a = Tensor([1, 2, 3])
    b = Tensor.zeros_like(a, dtype=dtypes.int8)
    assert a.dtype == dtypes.default_int and b.dtype == dtypes.int8, "a.dtype should be int and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch {a.shape} != {b.shape}"

  def test_ones_like_has_same_dtype_and_shape(self):
    # without a dtype argument, ones_like must mirror the source tensor's dtype and shape
    for datatype in [dtypes.float16, dtypes.float32, dtypes.int8, dtypes.int32, dtypes.int64, dtypes.uint8]:
      a = Tensor([1, 2, 3], dtype=datatype)
      b = Tensor.ones_like(a)
      assert a.dtype == b.dtype, f"dtype mismatch {a.dtype=} != {b.dtype}"
      assert a.shape == b.shape, f"shape mismatch {a.shape} != {b.shape}"

    # an explicit dtype overrides the source dtype (the default int here); the shape is still copied
    a = Tensor([1, 2, 3])
    b = Tensor.ones_like(a, dtype=dtypes.int8)
    assert a.dtype == dtypes.default_int and b.dtype == dtypes.int8, "a.dtype should be int and b.dtype should be char"
    assert a.shape == b.shape, f"shape mismatch {a.shape} != {b.shape}"

  def test_rand_like_device(self):
    # rand_like must place its result on the device of the tensor it is modelled on
    template = Tensor.ones(3, 3, device="CPU")
    sampled = Tensor.rand_like(template)
    self.assertEqual(sampled.device, template.device)

  def test_ndim(self):
    # ndim is the number of dimensions, 0 for a scalar tensor
    cases = [
      (Tensor(1), 0),
      (Tensor.randn(1), 1),
      (Tensor.randn(2,2,2), 3),
      (Tensor.randn(1,1,1,1,1,1), 6),
    ]
    for tensor, rank in cases:
      assert tensor.ndim == rank

  def test_argfix(self):
    # every creation helper must accept its shape as varargs, as a list, or as a tuple
    shapes = [(), (1,), (10,20,40)]
    # a shape sequence mixed with further positional arguments is rejected
    rejected = [((2, 2), 2, 2), ((2, 2), (2, 2)), ((128, 128), 0.0, 0.01)]
    for f in [Tensor.zeros, Tensor.ones, Tensor.rand, Tensor.randn, Tensor.empty]:
      for as_varargs in shapes:
        self.assertEqual(f(*as_varargs).shape, as_varargs)
      for as_list in shapes:
        self.assertEqual(f(list(as_list)).shape, as_list)
      for as_tuple in shapes:
        self.assertEqual(f(as_tuple).shape, as_tuple)

      for bad_args in rejected:
        with self.assertRaises(ValueError): f(*bad_args)

  def test_numel(self):
    # numel is the product of the shape: 0 as soon as one dim is 0, 1 for a scalar
    cases = [
      (Tensor.randn(10, 10), 100),
      (Tensor.randn(1,2,5), 10),
      (Tensor.randn(1,1,1,1,1,1), 1),
      (Tensor([]), 0),
      (Tensor.randn(1,0,2,5), 0),
      (Tensor(3), 1),
    ]
    for tensor, count in cases:
      assert tensor.numel() == count

  def test_len(self):
    # len() must agree with torch for tensors of the same shape.
    # The 3-d case previously compared torch.zeros(10,20) against Tensor.zeros(10,20,30); it only passed
    # because both leading dims are 10. Both sides are now built from the same shape.
    for shape in [(7,), (10,20), (10,20,30)]:
      assert len(torch.zeros(*shape)) == len(Tensor.zeros(*shape))
    assert len(torch.zeros(1).flatten()) == len(Tensor.zeros(1).flatten())
    # a 0-d tensor has no length
    with self.assertRaises(TypeError): len(Tensor(3))

  def test_size(self):
    # Tensor.size must match torch.Tensor.size, both without and with a dim argument
    reference, tiny = torch.zeros(10,20), Tensor.zeros(10,20)
    assert reference.size() == tiny.size()
    for dim in (0, 1, -1, -2):
      assert reference.size(dim) == tiny.size(dim)
    # out-of-range dim
    with self.assertRaises(IndexError): tiny.size(2)

  def test_tolist(self):
    # NOTE: float16 Tensor.tolist() requires python 3.12
    samples = ([1,2,3], [1.5,2,3], [[1,2,3], [4,5,6]], 3)
    for data in samples:
      from_tiny = Tensor(data).tolist()
      from_torch = torch.tensor(data).tolist()
      assert from_tiny == from_torch == data

  def test_element_size(self):
    # element_size() must equal dtype.itemsize for every dtype listed by dtypes.fields();
    # only the dtype objects are needed, so iterate the values rather than (name, dtype) pairs
    for dtype in dtypes.fields().values():
      assert dtype.itemsize == Tensor.randn(3, dtype=dtype).element_size(), f"Tensor.element_size() not matching Tensor.dtype.itemsize for {dtype}"

  def test_deepwalk_ctx_check(self):
    # two independent forward/backward passes that share the same trainable layer must both run
    layer = Tensor.uniform(1, 1, requires_grad=True)
    for _ in range(2):
      x = Tensor.randn(1, 1, 1)
      x.dot(layer).mean().backward()

  def test_zerosized_tensors(self):
    # both an empty list and None must give an empty tensor
    for empty_source in ([], None):
      np.testing.assert_equal(Tensor(empty_source).numpy(), np.array([]))

  def test_tensor_ndarray_dtype(self):
    source = np.array([1]) # where dtype is implicitly int64
    assert Tensor(source).dtype == dtypes.int64
    # an explicit dtype makes the ndarray get cast to the requested Tensor dtype
    for requested in (dtypes.float32, dtypes.float64):
      assert Tensor(source, dtype=requested).dtype == requested

  def test_tensor_from_blob(self):
    # 16 zeroed bytes viewed as four unsigned 32-bit ints; `x` must stay referenced for the whole test
    # because the tensor is built from its raw address
    x = memoryview(bytearray(16)).cast('I')

    t = Tensor.from_blob(mv_address(x), (4,), dtype=dtypes.int, device="CPU")
    z = (t+1)
    np.testing.assert_equal(z.numpy(), [1, 1, 1, 1])

    # overwrite the backing memory in place; a new computation on `t` must observe the new contents
    x[:] = array.array('I', [0, 1, 2, 3])
    z = (t+1)
    np.testing.assert_equal(z.numpy(), [1, 2, 3, 4])

  def test_tensor_list_dtype(self):
    # homogeneous int / bool lists: the dtype is inferred by default, an explicit dtype always wins
    homogeneous = (
      (dtypes.default_int, ([1], [[[1]]], [[1,1],[1,1]], [[[1,1],[1,1]],[[1,1],[1,1]]])),
      (dtypes.bool, ([True], [[[False]]], [[True,False],[True,False]], [[[False,True],[False,False]],[[True,True],[False,True]]])),
    )
    for inferred, samples in homogeneous:
      for arr in samples:
        assert Tensor(arr).dtype == inferred
        for explicit in (dtypes.float32, dtypes.float64):
          assert Tensor(arr, dtype=explicit).dtype == explicit

    # in order: empty tensor defaults, mixture of bool and int, mixture of bool, int and float.
    # the values are also compared against numpy's reading of the same nested list
    promoted = (
      (dtypes.default_float, ([], [[[]]], [[],[]])),
      (dtypes.default_int, ([True, 3], [[True],[3]], [[[True]], [[3]]], [[True, 3], [3, True]])),
      (dtypes.default_float, ([[True,True],[3.,True]], [[0,1],[3.,4]], [[[0],[1]],[[3.],[4]]], [[[True],[1]],[[3.],[4]]])),
    )
    for inferred, samples in promoted:
      for arr in samples:
        t = Tensor(arr)
        assert t.dtype == inferred
        np.testing.assert_allclose(t.numpy(), np.array(arr))

  def test_tensor_list_shapes(self):
    # nested empty lists keep their outer dimensions and end in a 0-sized one
    cases = (
      ([[[]]], (1,1,0)),
      ([[],[]], (2,0)),
      ([[[[]],[[]]], [[[]],[[]]], [[[]],[[]]]], (3,2,1,0)),
    )
    for nested, shape in cases:
      self.assertEqual(Tensor(nested).shape, shape)

  def test_tensor_list_errors(self):
    # inhomogeneous shape
    ragged_inputs = (
      [[],[[]]],
      [[1],[]],
      [[1],[1],1],
      [[[1,1,1],[1,1]]],
      [[1,1,1],[[1,1,1]]],
    )
    for ragged in ragged_inputs:
      with self.assertRaises(ValueError): Tensor(ragged)

  def test_tensor_mixed_list_tuple(self):
    # builds random binary nestings where every level is a list or a tuple with equal probability;
    # Tensor must read them exactly as numpy does
    def _pick_container(): return list if random.random() < 0.5 else tuple
    def _nested(levels):
      container = _pick_container()
      if levels == 0: return container()
      if levels == 1: return container([random.random(), random.random()])
      return container([_nested(levels-1), _nested(levels-1)])

    for levels in range(7):
      for _ in range(20):
        sample = _nested(levels)
        np.testing.assert_allclose(Tensor(sample).numpy(), np.array(sample))

  def test_tensor_list_special_values(self):
    # float16: nan, inf and magnitudes around 65504..65520 (65504 is the largest finite half-precision
    # value), in both signs; numpy's overflow warning is silenced while building the reference
    if is_dtype_supported(dtypes.float16):
      data = [math.nan, -math.inf, 65504, 65519, 65519.999, 65520, 65520.1]
      data = data + [-x for x in data]
      with np.errstate(over='ignore'): np.testing.assert_allclose(Tensor(data, dtype=dtypes.float16).numpy(), np.array(data).astype(np.float16))

    # uint32 and int32: python ints outside the 32-bit range must end up as numpy's astype leaves them.
    # The third value used to be written `1 << 32 - 1`, which python parses as `1 << (32 - 1)` because
    # `-` binds tighter than `<<`; it is spelled `1 << 31` here so the tested value is unchanged and explicit.
    # NOTE(review): if `(1 << 32) - 1` was the intended value, add it as a separate entry - confirm with the author
    data = [1 << 33, 1 << 32, 1 << 31, 1]
    data = data + [-x for x in data]
    for tiny_dtype, np_dtype in ((dtypes.uint32, np.uint32), (dtypes.int32, np.int32)):
      np.testing.assert_allclose(Tensor(data, dtype=tiny_dtype).numpy(), np.array(data).astype(np_dtype))

  def test_tensor_list_ndarray(self):
    # a python list of ndarrays (all-int rows, mixed float/int rows, 0-d arrays) must be read like np.array reads it
    cases = (
      [np.array([1, 2, 3]), np.array([1, 2, 3]), np.array([1, 2, 3])],
      [np.array([1.0, 2.0, 3.0]), np.array([1, 2, 3]), np.array([1, 2, 3])],
      [np.array(1.0), np.array(2.0), np.array(3.0)],
    )
    for rows in cases:
      np.testing.assert_equal(Tensor(rows).numpy(), np.array(rows))

  def test_tensor_dtype_errors(self):
    # a dtype argument that is not a dtype object (a string, a tuple) raises AttributeError
    for bad_dtype in ("typo", (dtypes.int,)):
      with self.assertRaises(AttributeError): Tensor([3], dtype=bad_dtype)

  def test_tensor_bytes(self):
    # a bytes object becomes a 1-d uint8 tensor holding one element per byte
    payload = b"abc123"
    from_bytes = Tensor(payload)
    assert from_bytes.dtype == dtypes.uint8
    assert from_bytes.shape == (6,)
    np.testing.assert_equal(from_bytes.numpy(), list(payload))

  def test_tensor_copy(self):
    # copy.deepcopy of a tensor must give a tensor with the same values
    shape = (3,3,3)
    duplicated = copy.deepcopy(Tensor.ones(shape))
    np.testing.assert_allclose(duplicated.numpy(), np.ones(shape))

  def test_copy_from_disk(self):
    # a slice of a disk-backed tensor must read the same before and after moving it to the default device
    on_disk = Tensor.randn(30).to(f"disk:{temp('test_copy_from_disk')}")
    window = on_disk[10:20]
    on_device = window.to(Device.DEFAULT)
    np.testing.assert_allclose(window.numpy(), on_device.numpy())

  # Regression test for https://github.com/tinygrad/tinygrad/issues/1751
  def test_copy_from_numpy_unaligned(self):
    # 2**15 is the minimum for repro
    arr = np.random.randn(2**15).astype(np.float32)
    fn = temp('test_copy_from_numpy_unaligned')
    # write a single prefix byte before the float32 payload so the payload starts at byte offset 1
    with open(fn, 'wb') as f: f.write(b't' + arr.tobytes())
    # map the whole file (payload plus the prefix byte) and expose it as a memoryview
    with open(fn, "a+b") as f: memview = memoryview(mmap.mmap(f.fileno(), arr.nbytes + 1))
    # skipping the prefix byte gives an ndarray over the same values whose buffer is not aligned
    ua_arr = np.frombuffer(memview[1:], dtype=arr.dtype, count=arr.shape[0])
    np.testing.assert_allclose(arr, ua_arr)
    assert not ua_arr.flags.aligned
    # force device copy - to() is opt'd away - Tensor(dev)/1 is ignored
    np.testing.assert_allclose(ua_arr, (Tensor(ua_arr)/Tensor(1)).numpy())

  def test_item_to_tensor_to_item(self):
    # .item() must hand back a python scalar of the same type and value it was built from, whether the
    # tensor was created from the scalar itself, from a one-element list, or reshaped to 5 dims.
    # The value checks used to read `np.testing.assert_allclose(item, a), a`, which only builds and
    # discards a tuple; the offending input is now passed as err_msg so it shows up on failure.
    for a in [0, 1, 2, 3, -1, -100, 100, -101.1, 2.345, 100.1, True, False]:
      variants = (
        ("scalar", Tensor(a)),
        ("buffered", Tensor([a])),
        ("reshaped", Tensor([a]).reshape((1, 1, 1, 1, 1))),
      )
      for label, tensor in variants:
        item = tensor.item()
        assert type(item) is type(a), a
        np.testing.assert_allclose(item, a, err_msg=f"{label} item round trip failed for {a!r}")

  def test_no_bool(self):
    # using a tensor as a condition must raise instead of silently picking a branch
    with self.assertRaises(TypeError):
      scalar = Tensor(3)
      if scalar:
        print("hi")

    # `in` on a list compares with == and then needs a truth value, so it must raise as well
    with self.assertRaises(TypeError):
      needle = Tensor([3])
      _found = needle in [Tensor([3]), Tensor([4]), Tensor([5])]

  def test_repr_with_grad(self):
    # printing a tensor that carries a gradient, and the value returned by backward(), must not raise
    with_grad = Tensor([1], requires_grad=True)
    plain = Tensor([1])
    backward_result = (with_grad + plain).sum().backward()
    for shown in (with_grad, backward_result):
      print(shown)

  def test_env_overwrite_default_device(self):
    import os, sys
    # Each case starts a fresh interpreter with extra environment variables set to "1" and asserts
    # Device.DEFAULT there. DISK=1 / NPY=1 must never become the default device; <DEFAULT>=1 must select
    # the current default, also when combined with DISK=1 or NPY=1.
    # The child is started from an argv list with env= rather than a shell string: no shell quoting is
    # involved, and sys.executable runs the same interpreter as this test instead of whatever `python3`
    # resolves to on PATH.
    default = Device.DEFAULT
    cases = [
      ({"DISK": "1"}, "!=", "DISK"),
      ({"NPY": "1"}, "!=", "NPY"),
      ({default: "1"}, "==", default),
      ({"DISK": "1", default: "1"}, "==", default),
      ({"NPY": "1", default: "1"}, "==", default),
    ]
    for extra_env, comparison, device_name in cases:
      with self.subTest(env=extra_env):
        code = f'from tinygrad import Device; assert Device.DEFAULT {comparison} "{device_name}"'
        # check=True raises CalledProcessError when the assertion fails in the child
        subprocess.run([sys.executable, "-c", code], env={**os.environ, **extra_env}, check=True)

  def test_no_attributeerror_after_apply_uop_exception(self):
    # an invalid reshape (4 elements into 3x2) must raise ValueError; asserting it explicitly keeps the
    # test from passing without checking anything if the reshape ever stops raising
    with self.assertRaises(ValueError):
      Tensor.arange(4).reshape(3,2)
    # after that failure, creating and realizing another tensor must still work
    Tensor.zeros(2, 2).realize()

@unittest.skip("this test is just flaky, sync issue")
class TestMoveTensor(unittest.TestCase):
  # names of instance 0 and instance 1 of the default device, used as source/destination throughout
  d0, d1 = f"{Device.DEFAULT}:0", f"{Device.DEFAULT}:1"

  @given(strat.sampled_from([d0, d1]), strat.sampled_from([d0, d1]),
         strat.sampled_from([dtypes.float16, dtypes.float32]), strat.sampled_from([True, False, None]))
  def test_to_preserves(self, src, dest, dtype, requires_grad):
    # .to(dest) must keep values, dtype, requires_grad and (when present) the gradient
    if not is_dtype_supported(dtype):
      return
    s = Tensor([1, 2, 3], device=src, dtype=dtype, requires_grad=requires_grad)
    if requires_grad: s.sum().backward()
    t = s.to(dest)
    np.testing.assert_equal(s.numpy(), t.numpy())
    assert s.dtype == t.dtype
    assert s.requires_grad == t.requires_grad
    if requires_grad:
      np.testing.assert_equal(s.grad.numpy(), t.grad.numpy())

  @given(strat.sampled_from([dtypes.float16, dtypes.float32]), strat.sampled_from([True, False, None]))
  def test_shard_preserves(self, dtype, requires_grad):
    # sharding across both devices must keep values, dtype and requires_grad.
    # Unsupported dtypes are skipped the same way test_to_preserves skips them.
    if not is_dtype_supported(dtype):
      return
    s = Tensor([1, 2, 3], dtype=dtype, requires_grad=requires_grad)
    t = s.shard((self.d0, self.d1))
    np.testing.assert_equal(s.numpy(), t.numpy())
    assert s.dtype == t.dtype
    assert s.requires_grad == t.requires_grad

  @given(strat.sampled_from([d0, d1]))
  def test_same_dev(self, dev):
    # moving a tensor to the device it already lives on must return the very same object
    x = Tensor([1,2,3], device=dev)
    y = x.to(dev)
    assert x is y

  def test_to_grad(self):
    # the gradient must flow back through a .to() that sits in the middle of the graph
    x = Tensor.eye(3, requires_grad=True, device=self.d0)
    y = Tensor([[2.0,0,-2.0]], requires_grad=True, device=self.d0)
    z = y.matmul(x).to(self.d1).sum()
    z.backward()
    np.testing.assert_equal(x.grad.numpy(), [[2,2,2],[0,0,0],[-2,-2,-2]])

class TestZeroShapeTensor(unittest.TestCase):
  def test_shape_stride(self):
    # every zero-size shape must report all-zero real strides.
    # numpy has stride 0, 0, 0 for all three shapes; torch has stride 2, 1, 1 for (3, 2, 0),
    # 2, 2, 1 for (3, 0, 2) and 1, 1, 1 for (0, 0, 0)
    for shape in [(3, 2, 0), (3, 0, 2), (0, 0, 0)]:
      empty = Tensor.empty(*shape)
      assert empty.shape == shape
      assert empty.lazydata.st.real_strides() == (0, 0, 0)

  def test_rand(self):
    # rand with a 0-sized dim yields an empty tensor of exactly the requested shape
    for shape in [(3, 2, 0), (0,), (0, 0, 0)]:
      sampled = Tensor.rand(*shape)
      assert sampled.shape == shape
      np.testing.assert_equal(sampled.numpy(), np.zeros(shape))

  def test_full(self):
    # constant-filled creation with a 0-sized dim, compared against the numpy equivalents
    shape = (3, 2, 0)
    zeros = Tensor.zeros(*shape)
    assert zeros.shape == shape
    np.testing.assert_equal(zeros.numpy(), np.zeros(shape))
    twelves = Tensor.full(shape, 12)
    assert twelves.shape == shape
    np.testing.assert_equal(twelves.numpy(), np.full(shape, 12))

  def test_reshape(self):
    # an empty tensor can be reshaped to any other shape that also has zero elements
    source = Tensor.zeros(3, 2, 0)
    for target in [(7, 0), (0,)]:
      reshaped = source.reshape(*target)
      assert reshaped.shape == target
      np.testing.assert_equal(reshaped.numpy(), np.zeros(target))
    with self.assertRaises(ValueError):
      # cannot reshape from size 0 to size 1
      source.reshape(())

  def test_expand(self):
    # expanding the leading dim of an empty tensor keeps it empty, with the new shape
    target = (6, 2, 0)
    expanded = Tensor.full((1, 2, 0), 12).expand(target)
    assert expanded.shape == target
    np.testing.assert_equal(expanded.numpy(), np.full(target, 12))

  def test_pad(self):
    # padding the 0-sized axis produces real (all-ones) data; padding another axis leaves the tensor empty
    cases = (
      ((None, None, (1, 1)), (3, 2, 2)),
      ((None, (1, 1), None), (3, 4, 0)),
      (((1, 1), None, None), (5, 2, 0)),
    )
    for padding, shape in cases:
      padded = Tensor.rand(3, 2, 0).pad(padding, value=1)
      assert padded.shape == shape
      np.testing.assert_equal(padded.numpy(), np.ones(shape))

  def test_shrink_into_zero(self):
    # shrinking an axis to the empty range (2, 2) gives a realizable tensor with a 0-sized dim
    source = Tensor.rand(3, 4).realize()
    cases = (
      ((None, (2, 2)), (3, 0)),
      (((2, 2), None), (0, 4)),
      (((2, 2), (2, 2)), (0, 0)),
    )
    for bounds, shape in cases:
      assert source.shrink(bounds).realize().shape == shape

  def test_cat(self):
    full = Tensor.rand(3, 2, 2)
    empty = Tensor.rand(3, 2, 0)

    # concatenating an empty tensor on either side along the 0-sized axis gives back the non-empty operand
    for joined in (full.cat(empty, dim=2), empty.cat(full, dim=2)):
      assert joined.shape == (3, 2, 2)
      np.testing.assert_equal(joined.numpy(), full.numpy())

    # empty with empty: the concatenated dim is summed and the result stays empty
    for dim, shape in ((0, (6, 2, 0)), (1, (3, 4, 0)), (2, (3, 2, 0))):
      joined = empty.cat(empty, dim=dim)
      assert joined.shape == shape
      np.testing.assert_equal(joined.numpy(), np.zeros(shape))

  def test_elementwise(self):
    shape = (3, 2, 0)

    # unary op
    lhs = Tensor.rand(3, 2, 0)
    exponentiated = lhs.exp()
    assert exponentiated.shape == shape
    np.testing.assert_equal(exponentiated.numpy(), np.exp(lhs.numpy()))

    # binary op
    rhs = Tensor.rand(3, 2, 0)
    assert rhs.shape == shape
    product = lhs * rhs
    assert product.shape == shape
    np.testing.assert_equal(product.numpy(), lhs.numpy() * rhs.numpy())

    # ternary op (where)
    condition = (Tensor.rand(3, 2, 0) > 0.5)
    assert condition.shape == shape
    selected = condition.where(lhs, rhs)
    assert selected.shape == shape
    np.testing.assert_equal(selected.numpy(), np.where(condition.numpy(), lhs.numpy(), rhs.numpy()))

  def test_reduce_over_non_zero(self):
    # reducing a non-empty axis of an empty tensor leaves an empty result
    reduced = Tensor.ones(3, 2, 0).sum(axis=1)
    assert reduced.shape == (3, 0)
    reference = np.sum(np.zeros((3, 2, 0)), axis=1)
    np.testing.assert_equal(reduced.numpy(), reference)

  def test_reduce_over_zero(self):
    # summing over the 0-sized axis gives a non-empty result, checked against numpy
    empty_np = np.zeros((3, 2, 0))

    dropped = Tensor.ones(3, 2, 0).sum(axis=2)
    assert dropped.shape == (3, 2)
    np.testing.assert_equal(dropped.numpy(), np.sum(empty_np, axis=2))

    # same reduction, keeping the reduced axis
    kept = Tensor.ones(3, 2, 0).sum(axis=2, keepdim=True)
    assert kept.shape == (3, 2, 1)
    np.testing.assert_equal(kept.numpy(), np.sum(empty_np, axis=2, keepdims=True))

  def test_clone(self):
    # a clone must hold the same values in a buffer object distinct from its source's
    def check_clone(source):
      duplicate = source.clone()
      np.testing.assert_allclose(source.numpy(), duplicate.numpy())
      self.assertIsNot(source.lazydata.base.buffer, duplicate.lazydata.base.buffer)

    # source already realized
    check_clone(Tensor.rand(16, 16).realize())
    # source still an unrealized expression
    check_clone(Tensor.rand(16, 16).mul(5.0).add(5.0))

  def test_clone_with_shrink(self):
    # cloning a shrunk view of an unrealized tensor must not share the source's buffer
    source = Tensor.rand(16, 16)
    view = source.shrink(((2, 10), None))
    cloned = view.clone()
    cloned.realize()
    self.assertIsNot(source.lazydata.base.buffer, cloned.lazydata.base.buffer)

  def test_clone_with_shrink_realized(self):
    # same as the unrealized variant, but the source is realized before the view is cloned
    source = Tensor.rand(16, 16).realize()
    view = source.shrink(((2, 10), None))
    cloned = view.clone()
    cloned.realize()
    self.assertIsNot(source.lazydata.base.buffer, cloned.lazydata.base.buffer)

  def test_clone_with_grad(self):
    # a clone taken after backward must carry a gradient equal to the source's
    source = Tensor.rand(16, 16, requires_grad=True)
    source.mul(5.0).add(5.0).mean().backward()
    duplicate = source.clone()
    for tensor in (source, duplicate):
      assert tensor.grad is not None
    np.testing.assert_allclose(source.grad.numpy(), duplicate.grad.numpy())

  def test_reduce_default(self):
    # value each full reduction must return on an empty tensor
    expectations = (
      ("max", -float("inf")),
      ("min", float("inf")),
      ("sum", 0),
      ("mean", float("nan")),
    )
    for reduction, expected in expectations:
      reduced = getattr(Tensor([]), reduction)()
      np.testing.assert_equal(reduced.numpy(), expected)

class TestTensorCreationDevice(unittest.TestCase):
  # test auxiliary tensors are created on the same device
  def test_one_hot(self):
    # one_hot on a tensor that was moved to "CPU" must realize without error
    labels = Tensor([1, 2, 3]).to("CPU")
    encoded = labels.one_hot(10)
    encoded.realize()

class TestTrainMode(unittest.TestCase):
  def test_train_mode(self):
    # Tensor.train() used as a decorator: Tensor.training is set while the wrapped function runs
    # and is unset again before and after it
    @Tensor.train()
    def inside_train_mode():
      assert Tensor.training

    assert not Tensor.training
    inside_train_mode()
    assert not Tensor.training

class TestInferenceMode(unittest.TestCase):
  def test_inference(self):
    # Tensor.test() as a context manager: a backward pass run inside it must leave every .grad unset
    x = Tensor(x_init, requires_grad=True)
    m = Tensor(m_init, requires_grad=True)
    W = Tensor(W_init, requires_grad=True)
    with Tensor.test():
      scaled = x.mul(m)
      projected = scaled.matmul(W)
      loss = projected.relu().sum()
      loss.backward()
    for tensor in (x, m, scaled, projected, W):
      assert tensor.grad is None
    # the requires_grad flag itself is left alone
    assert W.requires_grad

  def test_no_grad_mode_context_manager(self):
    # Tensor.test() as a decorator: the same checks are made inside the wrapped function
    x = Tensor(x_init, requires_grad=True)
    m = Tensor(m_init, requires_grad=True)
    W = Tensor(W_init, requires_grad=True)
    @Tensor.test()
    def forward_backward(x, m, W):
      scaled = x.mul(m)
      projected = scaled.matmul(W)
      loss = projected.relu().sum()
      loss.backward()
      for tensor in (x, m, scaled, projected, W):
        assert tensor.grad is None
    forward_backward(x, m, W)

class TestTensorMetadata(unittest.TestCase):
  # The statements in these tests are kept exactly as written: the asserted metadata names are the names
  # of the Tensor methods that get called, so rewriting an expression would change what is recorded.

  # reset the metadata context variable before every test
  def setUp(self) -> None: _METADATA.set(None)

  # NOOPs are not included in kernel metadata
  def test_exclude_noop_metadata(self):
    a = Tensor.rand(4, 4)*1
    self.assertEqual(a.lazydata.metadata.name, "__mul__")
    k = a.schedule()[-1]
    self.assertEqual([m.name for m in k.metadata], ["rand"])

  # we exclude const from kernel metadata because tensor methods can share the same CONST UOp
  @unittest.skip("TODO: flaky")
  def test_exclude_const_metadata(self):
    a = Tensor.arange(4)
    b = Tensor.full((4,), -1, dtype=dtypes.int).contiguous()
    sched = Tensor.schedule(a, b)
    self.assertEqual([m.name for m in sched[0].metadata], ["arange"])
    self.assertEqual([m.name for m in sched[1].metadata], ["contiguous"])

  # a single op: the tensor and the last schedule item both carry exactly that op's name
  def test_matmul(self):
    x = Tensor.rand(3, requires_grad=True)
    W = Tensor.rand(3, 3, requires_grad=True)
    out = x.matmul(W)
    self.assertEqual(out.lazydata.metadata.name, "matmul")
    si = out.schedule()[-1]
    self.assertEqual(len(si.metadata), 1)
    self.assertEqual(si.metadata[0].name, "matmul")

  def test_relu(self):
    x = Tensor.rand(3, requires_grad=True)
    out = x.relu()
    self.assertEqual(out.lazydata.metadata.name, "relu")
    si = out.schedule()[-1]
    self.assertEqual(len(si.metadata), 1)
    self.assertEqual(si.metadata[0].name, "relu")

  # several ops in one kernel: every contributing op shows up in the schedule item's metadata
  def test_complex(self):
    x = Tensor.rand(3, requires_grad=True)
    y = Tensor.rand(3, requires_grad=True)
    out = x.relu() * y.sigmoid()
    self.assertEqual(out.lazydata.metadata.name, "__mul__")
    self.assertEqual(out.lazydata.src[0].metadata.name, "relu")
    self.assertEqual(out.lazydata.src[1].metadata.name, "sigmoid")
    si = out.schedule()[-1]
    self.assertEqual(len(si.metadata), 3)
    self.assertEqual(set(m.name for m in si.metadata), {"relu", "sigmoid", "__mul__"})

  # gradients are tagged with the forward op they come from and with backward=True
  def test_complex_backward(self):
    x = Tensor.rand(3, requires_grad=True).realize()
    y = Tensor.rand(3, requires_grad=True).realize()
    out = (x.relu() * y.sigmoid()).sum()
    self.assertEqual(out.lazydata.metadata.name, "sum")
    out.backward()
    self.assertEqual(x.grad.lazydata.metadata.name, "relu")
    self.assertTrue(x.grad.lazydata.metadata.backward)
    self.assertEqual(y.grad.lazydata.metadata.name, "sigmoid")
    self.assertTrue(y.grad.lazydata.metadata.backward)
    si = Tensor.schedule(out, x.grad, y.grad)[-1]
    self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
    # three entries but only two distinct names; the set literal used to list "sigmoid" twice,
    # which is the same set
    self.assertEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
    bw = [m for m in si.metadata if m.backward]
    self.assertEqual(len(bw), 1)
    self.assertEqual(bw[0].name, "sigmoid")

class TestIdxUpcast(unittest.TestCase):
  def _find_op(self, ast: UOp, op: Ops):
    # depth-first search through ast and its sources; returns the first UOp whose op is `op`, else None
    if ast.op is op: return ast
    for src in ast.src:
      if (ret:=self._find_op(src, op)) is not None: return ret
    return None

  def _schedule_render(self, a: Tensor):
    # lowers, linearizes and renders the first SINK item of a's schedule and returns its uops
    # (None when the schedule has no SINK item)
    schedule, _ = a.schedule_with_vars()
    for s in schedule:
      if s.ast.op is Ops.SINK:
        renderer = Device[s.bufs[0].device].renderer
        uops = linearize_uop(full_graph_rewrite(rewrite_shapetracker_with_index(s.ast, renderer), renderer))
        renderer.render(uops)
        return uops
    return None

  def _assert(self, dtype: DType, a: Tensor):
    uops = self._schedule_render(a)
    # Assert the dtype of the INDEX value. This will need to be updated if the UOp spec changes
    store = next(uop for uop in uops if uop.op is Ops.STORE)
    assert store.op is Ops.STORE
    idx = self._find_op(store, Ops.INDEX)
    if idx is not None: # PTX turns Ops.INDEX into pointer arithmetic earlier than cstyle, plus it's already cast to int64
      assert idx.op is Ops.INDEX
      idx_val = idx.src[1]
      assert idx_val.dtype is dtype

  # use expand to generate kernel that uses large idx
  def do_op_then_assert(self, dtype: DType, dim1, dim2, dim3):
    self._assert(dtype, Tensor.empty(dim1, dim2, 1).expand(-1, -1, dim3).contiguous())

  # skipUnless skips when int64 is NOT supported, so the reason has to say so
  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is not supported")
  def test_overflow(self):
    # 2**11, 2**11, 2**11 -> 2**33 will overflow when indexed
    self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skipUnless(is_dtype_supported(dtypes.long), "int64 is not supported")
  def test_overflow_sym(self):
    # same as test_overflow, with the last dim given as a bound variable
    self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 0, 2048).bind(32))

  def test_regular(self):
    # small shapes: the index stays int
    self.do_op_then_assert(dtypes.int, 64, 64, 64)

  def test_regular_sym(self):
    # a variable bounded by 64 keeps the index int as well
    self.do_op_then_assert(dtypes.int, 2048, 2048, UOp.variable("dim3", 0, 64).bind(32))

  @unittest.skipIf(PTX, "PTX always convert Ops.INDEX to int64")
  def test_symfold(self):
    # This would cause an overflow, but after sym fold it's within int32
    a = Tensor.arange(65535)
    uops = self._schedule_render(a)
    assert all(uop.dtype is not dtypes.long for uop in uops)

  # without int64 support, the overflowing cases are expected to raise KeyError
  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow_sym(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, UOp.variable("dim3", 0, 2048).bind(32))

  @unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
  def test_int64_unsupported_overflow(self):
    with self.assertRaises(KeyError):
      self.do_op_then_assert(dtypes.long, 2048, 2048, 2048)

  @unittest.skip("This is kept for reference, it requires large memory to run")
  def test_overflow_kernel_run(self):
    # This creates a total of 2**31+10 elements, requiring at least 2147 MB memory to run
    # Modified example from issue 3271
    a = Tensor.empty(2**11, 2**11, 1, dtype=dtypes.int8).permute((2, 0, 1)).expand((2**9+10, -1, -1)).contiguous()
    a.realize()

# run the whole module through unittest's command-line runner when executed as a script
if __name__ == '__main__':
  unittest.main()