# inspired by https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py
from __future__ import annotations
import math, functools, itertools
import numpy as np
from typing import List, Tuple, Callable, Optional, ClassVar, Type, Union, Sequence
from tinygrad.helpers import prod, argfix, make_pair, getenv, DEBUG, flatten
from tinygrad.lazy import Device, LazyBuffer
from tinygrad.image import image_conv2d_decorator

# An instantiation of the Function is the Context
class Function:
  def __init__(self, device:str, *tensors:Tensor):
    self.device, self.parents = device, tensors
    self.needs_input_grad = [t.requires_grad for t in self.parents]
    self.requires_grad = True if any(self.needs_input_grad) else (None if any(x is None for x in self.needs_input_grad) else False)

  def forward(self, *args, **kwargs): raise NotImplementedError(f"forward not implemented for {type(self)}")
  def backward(self, *args, **kwargs): raise RuntimeError(f"backward not implemented for {type(self)}")

  @classmethod
  def apply(fxn:Type[Function], *x:Tensor, **kwargs) -> Tensor:
    ctx = fxn(x[0].device, *x)
    ret = Tensor(ctx.forward(*[t.lazydata for t in x], **kwargs), device=ctx.device, requires_grad=ctx.requires_grad)
    if ctx.requires_grad and not Tensor.no_grad: ret._ctx = ctx  # used by autograd engine
    return ret

import tinygrad.mlops as mlops
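
# A minimal sketch of how an op plugs in (illustrative only; the real ops live in
# tinygrad/mlops.py, and the exact LazyBuffer op names below are assumptions):
#
#   class Relu(Function):
#     def forward(self, x:LazyBuffer) -> LazyBuffer:
#       self.saved = x                      # stash whatever backward needs on the Context
#       return x.unary_op(UnaryOps.RELU)    # assumed LazyBuffer method / op enum
#     def backward(self, grad_output:LazyBuffer) -> LazyBuffer:
#       return ...                          # one grad (or None) per parent, as LazyBuffers
#
#   y = Relu.apply(x)   # x is a Tensor; y._ctx is the Relu instance, i.e. the "Context"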

# **** start with two base classes, Tensor and Function ****

class Tensor:
  __deletable__ = ('_ctx',)
  training : ClassVar[bool] = False
  no_grad : ClassVar[bool] = False

  def __init__(self, data, device=Device.DEFAULT, requires_grad:Optional[bool]=None):
    if isinstance(data, list):
      data = np.array(data, dtype=np.float32)
    elif isinstance(data, LazyBuffer) and data.device != device:
      # TODO: this has to realize, it shouldn't have to
      data = data.realize().toCPU()

    if isinstance(data, np.ndarray):
      data = data if data.shape else data.reshape((1,))
      self.lazydata = LazyBuffer.fromCPU(data.astype(np.float32), device)
    elif isinstance(data, LazyBuffer):
      self.lazydata = data
    else:
      raise RuntimeError(f"can't create Tensor from {data}")

    # tensors have gradients, buffers do not
    self.grad : Optional[Tensor] = None

    # NOTE: this can be in three states. False and None: no gradient, True: gradient
    # None (the default) will be updated to True if it's put in an optimizer
    self.requires_grad : Optional[bool] = requires_grad

    # internal variables used for autograd graph construction
    self._ctx : Optional[Function] = None

  def __repr__(self):
    return f"<Tensor {self.lazydata if self.lazydata.realized is None else self.lazydata.realized!r} with grad {(self.grad.lazydata if self.grad else None)!r}>"

  @property
  def shape(self) -> Tuple[int, ...]: return self.lazydata.shape

  # dtype handling was very broken. it's always float32 now
  @property
  def dtype(self) -> type: return np.float32

  @property
  def device(self) -> str: return self.lazydata.device

  # ***** data handlers ****

  def realize(self) -> Tensor:
    self.lazydata.realize()
    return self

  def assign(self, x) -> Tensor:
    if not isinstance(x, Tensor): x = Tensor(x)
    assert self.shape == x.shape
    assert not x.requires_grad  # self requires_grad is okay?
    if DEBUG >= 4: print(f"assign {self.lazydata} <- {x.lazydata}")
    if self.lazydata.realized is not None and not getenv("DISALLOW_ASSIGN"): x.lazydata.output_buffer = self.lazydata.realized
    self.lazydata = x.lazydata
    return self

  def detach(self): return Tensor(self.lazydata, device=self.device, requires_grad=False)
  def numpy(self) -> np.ndarray: return self.lazydata.toCPU()

  # TODO: if things are realized this won't work
  def to_(self, device:str):
    assert self.lazydata.realized is None
    self.lazydata.device = device
    if self.grad:
      self.grad.lazydata.device = device

  def to(self, device:str):
    ret = Tensor(self.lazydata, device)
    if self.grad:
      ret.grad = self.grad.to(device)
    return ret

  # ***** creation helper functions *****

  @staticmethod
  def zeros(*shape, **kwargs): return Tensor([0], **kwargs).reshape([1]*len(shape)).expand(shape).contiguous()

  @staticmethod
  def ones(*shape, **kwargs): return Tensor([1], **kwargs).reshape([1]*len(shape)).expand(shape).contiguous()

  @staticmethod
  def zeros_like(tensor, **kwargs): return Tensor.zeros(*tensor.shape, **kwargs)

  @staticmethod
  def empty(*shape, **kwargs): return Tensor.zeros(*shape, **kwargs)

  @staticmethod
  def eye(dim, **kwargs): return Tensor([1], **kwargs).slice(((0,dim+1),)).reshape(1, dim+1).expand(dim, dim+1).reshape(dim*(dim+1)).slice(((0,dim*dim),)).reshape(dim, dim)
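  # How eye works, worked through for dim=2 (illustrative): [1] is zero-padded to [1,0,0],
  # broadcast to dim rows of length dim+1 -> [1,0,0,1,0,0], truncated to dim*dim elements
  # -> [1,0,0,1], and reshaped to (dim, dim), which lands the ones on the diagonal.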

  # TODO: below line, remove use of numpy here and make lazy
  # TODO: requires cumsum to remove numpy
  @staticmethod
  def arange(stop, start=0, step=1, **kwargs): return Tensor(np.arange(start=start, stop=stop, step=step, dtype=np.float32), **kwargs)

  # ***** (numpy) rng helper functions *****
  # TODO: move randomness generation out of numpy

  _rng : ClassVar[np.random.Generator] = np.random.default_rng()
  @staticmethod
  def manual_seed(seed=None): Tensor._rng = np.random.default_rng(seed=seed)

  @staticmethod
  def rand(*shape, **kwargs) -> Tensor: return Tensor(Tensor._rng.random(size=shape, dtype=np.float32), **kwargs)

  # TODO: replace with a transformation from uniform -> gaussian
  @staticmethod
  def randn(*shape, **kwargs) -> Tensor: return Tensor(Tensor._rng.standard_normal(size=shape, dtype=np.float32), **kwargs)

  # ***** rng hlops *****

  @staticmethod
  def uniform(*shape, **kwargs) -> Tensor: return Tensor.rand(*shape, **kwargs) * 2 - 1

  @staticmethod
  def scaled_uniform(*shape, **kwargs) -> Tensor: return Tensor.uniform(*shape, **kwargs).mul(prod(shape)**-0.5)

  # https://www.tensorflow.org/api_docs/python/tf/keras/initializers/GlorotUniform
  @staticmethod
  def glorot_uniform(*shape, **kwargs) -> Tensor: return Tensor.uniform(*shape, **kwargs).mul((6/(shape[0]+prod(shape[1:])))**0.5)

  # ***** toposort and backward pass *****

  def deepwalk(self):
    def _deepwalk(node, visited, nodes):
      visited.add(node)
      if node._ctx:
        for i in node._ctx.parents:
          if i not in visited: _deepwalk(i, visited, nodes)
        nodes.append(node)
      return nodes
    return _deepwalk(self, set(), [])

  def backward(self):
    assert self.shape == (1,)

    # fill in the first grad with one
    # this is "implicit gradient creation"
    self.grad = Tensor.ones(*self.shape, device=self.device, requires_grad=False)

    for t0 in reversed(self.deepwalk()):
      if not any(x.requires_grad for x in t0._ctx.parents):
        continue
      assert (t0.grad is not None)
      grads = t0._ctx.backward(t0.grad.lazydata)
      grads = [Tensor(g, device=self.device, requires_grad=False) if g is not None else None
        for g in ([grads] if len(t0._ctx.parents) == 1 else grads)]
      for t, g in zip(t0._ctx.parents, grads):
        if g is not None and t.requires_grad:
          assert g.shape == t.shape, f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
          t.grad = g if t.grad is None else (t.grad + g)
      del t0._ctx
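
  # Example (illustrative): backward only runs on a scalar-shaped (1,) tensor:
  #   x = Tensor.rand(3, requires_grad=True)
  #   loss = (x * x).sum()    # shape (1,)
  #   loss.backward()         # x.grad now holds 2*x (a Tensor on the same device)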

  # ***** movement mlops *****

  def reshape(self, shape, *args) -> Tensor:
    new_shape = argfix(shape, *args)
    assert len(new_shape) > 0 and all(x != 0 for x in new_shape), f"zeros not allowed in shape {new_shape}"
    return mlops.Reshape.apply(self, shape=tuple(-prod(self.shape) // prod(new_shape) if s == -1 else s for s in new_shape))
  def expand(self, shape, *args) -> Tensor: return mlops.Expand.apply(self, shape=tuple(x if x != -1 else s for s,x in zip(self.shape, argfix(shape, *args))))
  def permute(self, order, *args) -> Tensor: return mlops.Permute.apply(self, order=argfix(order, *args))
  def flip(self, axis, *args) -> Tensor: return mlops.Flip.apply(self, axis=argfix(axis, *args))
  def pad(self, arg:Tuple[Tuple[int, int], ...]) -> Tensor: return mlops.Pad.apply(self, arg=arg) if any(x != (0,0) for x in arg) else self
  def shrink(self, arg:Tuple[Tuple[int, int], ...]) -> Tensor: return mlops.Shrink.apply(self, arg=arg) if any(x != (0,s) for x,s in zip(arg, self.shape)) else self

  # ***** movement hlops *****

  # NOTE: using slice is discouraged and things should migrate to pad and shrink
  def slice(self, arg:Sequence[Optional[Tuple[int, int]]]) -> Tensor:
    arg_ = tuple(a if a is not None else (0,s) for s,a in zip(self.shape, arg))
    padding = tuple((max(0, -p[0]), max(0, p[1]-self.shape[i])) for i,p in enumerate(arg_))
    return self.pad(padding).shrink(tuple((p[0] + padding[i][0], p[1] + padding[i][0]) for i,p in enumerate(arg_)))
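
  # Example (illustrative): on a shape (3,) tensor, slice(((-1,4),)) pads one element on each
  # side (out-of-range positions become zeros) and then shrinks to the requested window,
  # giving shape (5,).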

  # Tensors mostly follow the normal python indexing / slicing behavior for sequences
  # - Negative indices are taken relative to the end of the sequence, so X[-2] returns the 2nd-to-last element
  # - A slice i:j returns the elements with indices in [i, j)
  # - If omitted, i and j will default to 0 and N, respectively, where N is the length of the sequence
  # - Negative values for i and j are taken relative to the end of the sequence
  # - Both i and j will be clamped to the range (-N, N], where N is the length of the sequence
  # - Indexing with np.newaxis or None on a given axis will add a new dimension of size one before that axis
  # - Empty slices are not allowed
  # - Strides other than 1 are not allowed
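  # Examples (illustrative) on t = Tensor.rand(2, 3):
  #   t[0]        -> shape (3,)   first row (an int index drops that dimension)
  #   t[:, 1:3]   -> shape (2, 2) columns 1 and 2
  #   t[-1]       -> shape (3,)   last row
  #   t[None, 0]  -> shape (1, 3) np.newaxis/None inserts a new axis of size 1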
  def __getitem__(self, val):
    def slcfix(i, sz, default): return default if i is None else max(0, min(sz, sz+i if i < 0 else i))  # Fix negative idxs, clamp to [0,N]
    new_slice, new_shape = [], []
    val = [val] if not isinstance(val, (list, tuple)) else val
    assert sum(s is not None for s in val) <= len(self.shape)
    assert all(s.step is None or s.step == 1 for s in val if isinstance(s, slice))
    for i,(sz,s) in enumerate(zip(self.shape, [v for v in val if v is not None])):  # Slicing only depends on ints + slices
      if isinstance(s, int) and not (-sz <= s < sz):
        raise IndexError(f"index {s} is out of bounds for dimension {i} with size {sz}")
      new_slice.append((s%sz, s%sz+1) if isinstance(s, int) else (slcfix(s.start, sz, 0), slcfix(s.stop, sz, sz)))
    for s,sz in zip(val, [self.shape[i-1] for i in itertools.accumulate([s is not None for s in val])]):  # Shape depends on slices + positions of Nones
      if not isinstance(s, int):
        new_shape.append(1 if s is None else slcfix(s.stop, sz, sz) - slcfix(s.start, sz, 0))
    new_shape += [self.shape[i] for i in range(len(new_slice), len(self.shape))]
    new_slice += [(0,self.shape[i]) for i in range(len(new_slice), len(self.shape))]
    return self.slice(new_slice).reshape(new_shape if len(new_shape) else (1,))

  def cat(self, *args, dim=0):
    dim = (dim + len(self.shape)) if dim < 0 else dim
    for y in args:
      assert len(y.shape) == len(self.shape) and all(y.shape[i] == s for i,s in enumerate(self.shape) if i != dim)
    catargs = [self] + list(args)
    shape_cumsum = [0, *itertools.accumulate([y.shape[dim] for y in catargs])]
    slc = [[(0, s) for s in self.shape] for _ in catargs]
    for s,k in zip(slc, shape_cumsum):
      s[dim] = (-k, shape_cumsum[-1]-k)
    return functools.reduce(Tensor.__add__, [arg.slice(s) for arg,s in zip(catargs, slc)])

  # TODO: make this nicer with syntactic sugar in slice
  def chunk(self, num, dim):
    slice_params = [[(0, s) for s in self.shape] for _ in range(num)]
    for i,k in enumerate(range(0, self.shape[dim], self.shape[dim]//num)):
      slice_params[i][dim] = (k, min(self.shape[dim], k+self.shape[dim]//num))
    return [self.slice(p) for p in slice_params]

  def unsqueeze(self, dim):
    if dim < 0: dim = len(self.shape) + dim + 1
    return self.reshape(self.shape[:dim] + (1,) + self.shape[dim:])

  # (padding_left, padding_right, padding_top, padding_bottom)
  def pad2d(self, padding:Tuple[int, ...]): return self.slice(((0,self.shape[0]), (0,self.shape[1]), (-padding[2],self.shape[2]+padding[3]), (-padding[0],self.shape[3]+padding[1])))
  # TODO: this is totally not transpose
  def transpose(self, order=(1,0)) -> Tensor: return self.permute(order=order)
  def flatten(self, start_dim=0): return self.reshape(shape=tuple(list(self.shape[0:start_dim]) + [-1]))

  # ***** reduce ops *****

  def _reduce(self, fxn:Type[Function], axis:Optional[Union[int, Tuple[int, ...]]]=None, keepdim=False):
    axis_ : List[int] = list(range(len(self.shape))) if axis is None else ([axis] if isinstance(axis, int) else list(axis))
    axis_ = [x if x >= 0 else x+len(self.shape) for x in axis_]
    shape = [self.shape[i] for i in range(len(self.shape)) if i not in axis_]
    ret = fxn.apply(self, new_shape=tuple(1 if i in axis_ else self.shape[i] for i in range(len(self.shape))))
    return ret if keepdim else ret.reshape(shape=[1] if shape == [] else shape)

  def sum(self, axis=None, keepdim=False): return self._reduce(mlops.Sum, axis, keepdim)
  def max(self, axis=None, keepdim=False): return self._reduce(mlops.Max, axis, keepdim)
  def min(self, axis=None, keepdim=False): return -((-self).max(axis=axis, keepdim=keepdim))

  def mean(self, axis=None, keepdim=False):
    out = self.sum(axis=axis, keepdim=keepdim)
    return out * (prod(out.shape)/prod(self.shape))

  def _softmax(self, axis):
    m = self - self.max(axis=axis, keepdim=True)
    e = m.exp()
    return m, e, e.sum(axis=axis, keepdim=True)

  def softmax(self, axis=-1):
    _, e, ss = self._softmax(axis)
    return e.div(ss)

  def log_softmax(self, axis=-1):
    m, _, ss = self._softmax(axis)
    return m - ss.log()

  # ***** processing ops *****

  def _pool(self, k_:Tuple[int, ...], stride:Union[Tuple[int, ...], int]=1, dilation:Union[Tuple[int, ...], int]=1) -> Tensor:
    assert len(self.shape) >= len(k_), f"can't pool {self.shape} with {k_}"
    s_, d_ = make_pair(stride, len(k_)), make_pair(dilation, len(k_))
    assert len(k_) == len(s_) and len(k_) == len(d_), f"stride/dilation mismatch kernel:{k_} stride:{s_} dilation:{d_}"
    slc_prefix, prefix, i_ = [(0,x) for x in self.shape[0:-len(k_)]], self.shape[0:-len(k_)], self.shape[-len(k_):]
    if any(k > s for k,s in zip(k_, s_)) or any(d != 1 for d in d_):
      o_ = [(i - d * (k-1) - 1)//s + 1 for i,d,k,s in zip(i_, d_, k_, s_)]
      e_ = [math.ceil(k*(i+d) / i) for k,i,d in zip(k_, i_, d_)]  # expands such that we don't need padding
      xup = self.reshape(*prefix, *flatten((1,i) for i in i_)).expand(*prefix, *flatten((e,i) for e,i in zip(e_, i_))).reshape(*prefix, *[e*i for e,i in zip(e_, i_)])
      # slide by dilation
      xup = xup.slice(slc_prefix + [(0,k*(i+d)) for k,i,d in zip(k_, i_, d_)])
      xup = xup.reshape(*prefix, *flatten((k,i+d) for k,i,d in zip(k_, i_, d_)))
      xup = xup.slice(slc_prefix + flatten(((0,k), (0,o*s)) for k,o,s in zip(k_, o_, s_)))
      # handle stride, and permute to move reduce to the end
      xup = xup.reshape(*prefix, *flatten((k,o,s) for k,o,s in zip(k_, o_, s_)))
      xup = xup.slice(slc_prefix + flatten(((0,k), (0,o), (0,1)) for k,o in zip(k_, o_)))
      xup = xup.reshape(*prefix, *flatten((k,o) for k,o in zip(k_, o_)))
      return xup.permute(*range(len(prefix)), *[len(prefix)+i*2+1 for i in range(len(k_))], *[len(prefix)+i*2 for i in range(len(k_))])
    else:
      # TODO: once the shapetracker can optimize well, remove this alternative implementation. or not if the CPU implementation doesn't use ShapeTracker
      o_ = [(i+(s-k))//s for i,s,k in zip(i_, s_, k_)]
      xup = self.slice(slc_prefix + [(0,o*s) for o,s in zip(o_, s_)])
      xup = xup.reshape(*prefix, *flatten(((o, s) for o,s in zip(o_, s_))))
      xup = xup.slice(slc_prefix + flatten(((0,o), (0,k)) for o,k in zip(o_, k_)))
      return xup.permute(*range(len(prefix)), *[len(prefix)+i*2 for i in range(len(k_))], *[len(prefix)+i*2+1 for i in range(len(k_))])

  # NOTE: these work for more than 2D
  def avg_pool2d(self, kernel_size=(2,2), stride=None): return self._pool(make_pair(kernel_size), stride if stride is not None else kernel_size).mean(axis=tuple(range(0-len(make_pair(kernel_size)), 0)))
  def max_pool2d(self, kernel_size=(2,2), stride=None): return self._pool(make_pair(kernel_size), stride if stride is not None else kernel_size).max(axis=tuple(range(0-len(make_pair(kernel_size)), 0)))
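
  # Example (illustrative): _pool tiles the trailing dims into (..., *out_spatial, *kernel),
  # and the pooling ops then reduce over the kernel axes. A (1,1,4,4) input with the default
  # 2x2 kernel/stride halves the spatial size:
  #   Tensor.ones(1,1,4,4).avg_pool2d().shape == (1,1,2,2)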

  @image_conv2d_decorator
  def conv2d(self, weight:Tensor, bias:Optional[Tensor]=None, groups=1, stride=1, dilation=1, padding=0) -> Tensor:
    (bs,cin_,_,_), (cout,cin,H,W) = self.shape, weight.shape
    assert cin*groups == cin_, f"Input Tensor shape {self.shape} does not match the shape of the weights {weight.shape}. ({cin*groups} vs. {cin_})"
    padding_ = [padding]*4 if isinstance(padding, int) else (padding if len(padding) == 4 else [padding[1], padding[1], padding[0], padding[0]])

    # conv2d is a pooling op (with padding)
    x = self.pad2d(padding_)._pool((H,W), stride, dilation)

    oy, ox, rcout = x.shape[2], x.shape[3], cout//groups
    # NOTE: we do this expand explicitly so the permute isn't pushed in the binop
    x = x.reshape(bs, groups, 1, cin, oy, ox, H, W).expand(bs, groups, rcout, cin, oy, ox, H, W).permute(0,1,2,4,5,3,6,7)

    # conv! broadcasted to (bs, groups, rcout, oy, ox, cin, H, W)
    ret = (x * weight.reshape(1, groups, rcout, 1, 1, cin, H, W)).sum((-3, -2, -1)).reshape(bs, cout, oy, ox)
    return ret if bias is None else ret.add(bias.reshape(1, -1, 1, 1))
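
  # Example shapes (illustrative): a (1,3,32,32) input convolved with a (16,3,3,3) weight and
  # padding=1 keeps the spatial size, giving an output of shape (1,16,32,32).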

  def dot(self, w:Tensor) -> Tensor:
    # NOTE: we use a 1x1 conv2d to do the matmul. mxk @ kxn = (1,k,m,1).conv2d(n,k,1,1)
    bs, groups = prod(self.shape[0:-2]), prod(w.shape[0:-2])
    cin, cout = w.shape[-2], w.shape[-1]
    out_shape_t = self.shape[0:-2] + (cout,-1)
    if len(self.shape) > 1:
      order = tuple(range(len(self.shape)-2)) + (len(self.shape)-1, len(self.shape)-2)
    else:
      order, out_shape_t = (0,), (cout, )
    worder = tuple(range(len(w.shape)-2)) + (len(w.shape)-1, len(w.shape)-2)

    # NOTE: with NHWC we can remove the transposes
    # bs x groups*cin x H x W
    cx = self.transpose(order=order).reshape(shape=(bs//groups, groups*cin, -1, 1))
    # groups*cout x cin x H, W
    cw = w.transpose(order=worder).reshape(shape=(groups*cout, cin, 1, 1))
    return cx.conv2d(cw, groups=groups).reshape(shape=out_shape_t).transpose(order=order)
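
  # Walking through the 1x1-conv trick (illustrative) for A(m,k) @ B(k,n): A is transposed and
  # reshaped into a (1, k, m, 1) "image", B into an (n, k, 1, 1) "weight"; the conv output
  # (1, n, m, 1) is then reshaped to (n, m) and transposed back to (m, n).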

  # ***** mlops (unary) *****

  def contiguous(self): return mlops.Contiguous.apply(self)
  def log(self): return mlops.Log.apply(self)
  def exp(self): return mlops.Exp.apply(self)

  # ***** math functions (unary) *****

  def __neg__(self): return 0.0-self
  def sqrt(self): return self.pow(0.5)
  def square(self): return self*self
  def clip(self, min_, max_): return ((self-min_).relu()+min_) - (self-max_).relu()
  def abs(self): return self.relu() + (-self).relu()
  def sign(self): return self / (self.abs() + 1e-10)
  def relu(self): return self.maximum(0)
  def reciprocal(self): return 1.0/self

  # ***** activation functions (unary) *****

  def sigmoid(self): return (1.0 + (-self).exp()).reciprocal()
  def elu(self, alpha=1.0): return self.relu() - alpha*(1-self.exp()).relu()
  def swish(self): return self * self.sigmoid()
  def silu(self): return self.swish()  # The SiLU function is also known as the swish function.
  def relu6(self): return self.relu() - (self-6).relu()
  def hardswish(self): return self * (self+3).relu6() * (1/6)
  def tanh(self): return 2.0 * ((2.0 * self).sigmoid()) - 1.0
  def gelu(self): return 0.5 * self * (1 + (self * 0.7978845608 * (1 + 0.044715 * self * self)).tanh())
  def quick_gelu(self): return self * (self * 1.702).sigmoid()
  def leakyrelu(self, neg_slope=0.01): return self.relu() - (-neg_slope*self).relu()
  def mish(self): return self * self.softplus().tanh()
  def softplus(self, beta=1): return (1/beta) * (1 + (self*beta).exp()).log()

  # ***** broadcasted binary mlops *****

  def _broadcasted(self, fxn:Type[Function], other:Union[Tensor, float], reverse:bool=False) -> Tensor:
    x,y = [Tensor([t], device=self.device, requires_grad=False) if not isinstance(t, Tensor) else t for t in ([other,self] if reverse else [self,other])]
    x,y = [t.reshape([1]*(max(len(x.shape), len(y.shape))-len(t.shape)) + list(t.shape)) for t in [x,y]]
    shape_ret = tuple(max(sx, sy) for sx,sy in zip(x.shape, y.shape))
    return fxn.apply(x.expand(shape_ret), y.expand(shape_ret))
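
  # Example (illustrative): broadcasting right-aligns the shapes and expands size-1 axes, so a
  # (3,1) tensor combined with a (4,) tensor produces a (3,4) result:
  #   (Tensor.ones(3,1) * Tensor.ones(4)).shape == (3,4)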

  def add(self, x:Union[Tensor, float], reverse=False) -> Tensor: return self._broadcasted(mlops.Add, x, reverse) if isinstance(x, Tensor) or x != 0.0 else self
  def sub(self, x:Union[Tensor, float], reverse=False) -> Tensor: return self._broadcasted(mlops.Sub, x, reverse) if isinstance(x, Tensor) or x != 0.0 or reverse else self
  def mul(self, x:Union[Tensor, float], reverse=False) -> Tensor: return self._broadcasted(mlops.Mul, x, reverse) if isinstance(x, Tensor) or x != 1.0 else self
  def pow(self, x:Union[Tensor, float], reverse=False) -> Tensor: return self._broadcasted(mlops.Pow, x, reverse) if isinstance(x, Tensor) or x != 1.0 or reverse else self
  def div(self, x:Union[Tensor, float], reverse=False) -> Tensor: return self._broadcasted(mlops.Div, x, reverse) if isinstance(x, Tensor) or x != 1.0 or reverse else self
  def matmul(self, x:Tensor, reverse=False) -> Tensor: return x.dot(self) if reverse else self.dot(x)

  def maximum(self, x:Union[Tensor, float]) -> Tensor: return self._broadcasted(mlops.Maximum, x)
  def minimum(self, x:Union[Tensor, float]) -> Tensor: return -((-self).maximum(-x))
  def eq(self, x) -> Tensor: return self._broadcasted(mlops.Equal, x, False)

  # ***** binary op wrappers (18 wasted lines to make the typechecker happy) *****

  # NOTE: __pow__ and friends are broken in mypyc with the ** operator
  def __add__(self, x) -> Tensor: return self.add(x)
  def __sub__(self, x) -> Tensor: return self.sub(x)
  def __mul__(self, x) -> Tensor: return self.mul(x)
  def __pow__(self, x) -> Tensor: return self.pow(x)
  def __truediv__(self, x) -> Tensor: return self.div(x)
  def __matmul__(self, x) -> Tensor: return self.matmul(x)

  def __radd__(self, x) -> Tensor: return self.add(x, True)
  def __rsub__(self, x) -> Tensor: return self.sub(x, True)
  def __rmul__(self, x) -> Tensor: return self.mul(x, True)
  def __rpow__(self, x) -> Tensor: return self.pow(x, True)
  def __rtruediv__(self, x) -> Tensor: return self.div(x, True)
  def __rmatmul__(self, x) -> Tensor: return self.matmul(x, True)

  def __iadd__(self, x) -> Tensor: return self.assign(self.add(x))
  def __isub__(self, x) -> Tensor: return self.assign(self.sub(x))
  def __imul__(self, x) -> Tensor: return self.assign(self.mul(x))
  def __ipow__(self, x) -> Tensor: return self.assign(self.pow(x))
  def __itruediv__(self, x) -> Tensor: return self.assign(self.div(x))
  def __imatmul__(self, x) -> Tensor: return self.assign(self.matmul(x))

  def __ge__(self, x) -> Tensor: return self.maximum(x).eq(self)
  def __le__(self, x) -> Tensor: return self.maximum(x).eq(x)
  def __lt__(self, x) -> Tensor: return 1.0-(self>=x)
  def __gt__(self, x) -> Tensor: return 1.0-(self<=x)

  # ***** functional nn ops *****

  def linear(self, weight:Tensor, bias:Optional[Tensor]=None):
    x = self.mul(weight) if len(weight.shape) == 1 else self.dot(weight)
    return x.add(bias) if bias is not None else x

  def sequential(self, ll:List[Callable[[Tensor], Tensor]]): return functools.reduce(lambda x,f: f(x), ll, self)

  def layernorm(self, axis=-1, eps:float=1e-5) -> Tensor:
    y = (self - self.mean(axis=axis, keepdim=True))
    return y.div((y*y).mean(axis=axis, keepdim=True).add(eps).sqrt())
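  # i.e. layernorm computes (x - mean(x)) / sqrt(var(x) + eps) along `axis`, without a
  # learnable scale/shift (a caller can apply those separately).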

  def batchnorm(self, weight:Tensor, bias:Tensor, mean:Tensor, invstd:Tensor) -> Tensor:
    x = (self - mean.reshape(shape=[1, -1, 1, 1])) * weight.reshape(shape=[1, -1, 1, 1])
    return x.mul(invstd.reshape(shape=[1, -1, 1, 1])) + bias.reshape(shape=[1, -1, 1, 1])

  def dropout(self, p=0.5) -> Tensor:
    if not Tensor.training: return self
    _mask : np.ndarray = np.asarray(Tensor._rng.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
    return self * Tensor(_mask, requires_grad=False, device=self.device) * (1/(1.0 - p))

# register functions to move between devices
for device in [device for device in Device._buffers.keys() if device[0] != "_"]:
  setattr(Tensor, f"{device.lower()}", functools.partialmethod(Tensor.to, device))
  setattr(Tensor, f"{device.lower()}_", functools.partialmethod(Tensor.to_, device))