You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
222 lines
6.1 KiB
222 lines
6.1 KiB
#!/usr/bin/env python3
|
|
from pathlib import Path
|
|
from ctypes import *
|
|
import json
|
|
import collections
|
|
import numpy as np
|
|
import faulthandler
|
|
import struct
|
|
faulthandler.enable()
|
|
|
|
basedir = Path(__file__).resolve().parent
|
|
|
|
libane = None
|
|
aneregs = None
|
|
def init_libane():
|
|
global libane, aneregs
|
|
libane = cdll.LoadLibrary((basedir / "libane.dylib").as_posix())
|
|
|
|
libane.ANE_Compile.argtypes = [c_char_p, c_int]
|
|
libane.ANE_Compile.restype = c_void_p
|
|
|
|
libane.ANE_TensorCreate.restype = c_void_p
|
|
|
|
libane.ANE_TensorData.argtypes = [c_void_p]
|
|
libane.ANE_TensorData.restype = POINTER(c_uint16)
|
|
|
|
libane.ANE_Run.argtypes = [c_void_p]*4
|
|
libane.ANE_Run.restype = c_int
|
|
|
|
#libane.ANE_RegDebug.restype = c_char_p
|
|
|
|
with open(basedir / "aneregs.json") as f:
|
|
aneregs = json.load(f)
|
|
|
|
ANE_Struct = [
|
|
# aneTD.Header
|
|
("u32", 0x1C, "NextCommandOffset"),
|
|
|
|
# KernelDMASrc @ section @ 0x2C len 0xF4
|
|
# reloc 0x2c-0x34?? = weights
|
|
# u32[16] 0x34-0x74 = 0x80 | 1 if used
|
|
# u32[16] 0x74-0xB4 = <channel data offset>
|
|
# u32[16] 0xB4-0xF4 = <channel data length>
|
|
|
|
# Common @ section @ 0x128 len 0x3C (conv)
|
|
("u16", 0x128, "InputWidth"),
|
|
("u16", 0x12A, "InputHeight"),
|
|
("u16", 0x12C, "InputDepth"),
|
|
|
|
("u32", 0x130, "InputOutputType"), # (OutputType * 0x10) | InputType
|
|
# UInt8 = 0, Int8 = 1, Float16 = 2
|
|
|
|
("u32", 0x134, "InputChannels"),
|
|
("u32", 0x138, "OutputChannels"),
|
|
|
|
("u16", 0x13C, "OutputWidth"),
|
|
("u16", 0x13E, "OutputHeight"),
|
|
("u16", 0x140, "OutputDepth"),
|
|
|
|
("u16", 0x144, "KernelSize"), # 0xa000 | (KernelHeight * 0x20) | KernelWidth
|
|
("u16", 0x146, "Padding"), # 0x5000 | (PadTop * 0x40) | (PadLeft * 2)
|
|
|
|
("u16", 0x14C, "BatchSize"),
|
|
|
|
# TileDMASrc @ section @ 0x16C len 0x6C (input)
|
|
# reloc 0x16c-0x174 = image
|
|
("u32", 0x178, "InputRowStride"),
|
|
("u32", 0x17C, "InputPlaneStride"),
|
|
("u32", 0x180, "InputDepthStride"),
|
|
("u32", 0x184, "InputBatchStride"),
|
|
|
|
("u8", 0x1A7, "InputInterleave"),
|
|
|
|
# L2 @ section @ 0x1E0 len 0x44
|
|
# [0x1ec, 0x1f0, 0x1f4, 0x1f8, 0x214] = number of engines
|
|
# [0x1f0, 0x1f4, 0x1f8, 0x214] = engines for inconv?
|
|
# [0x21c, 0x220, 0x224] = engines for outconv?
|
|
|
|
# NE @ section @ 0x22c len 0xC (scaling)
|
|
("u16", 0x230, "BiasScalar"),
|
|
("u16", 0x232, "ScaleScalar"),
|
|
|
|
# section @ 0x240 len 0x10
|
|
("u16", 0x246, "NeuronType"), # 0x10 = copy, 0x11 = ReLU, 0x12 = custom
|
|
("u32", 0x250, "PostScale"),
|
|
|
|
# TileDMADst @ section @ 0x258 len 0x18
|
|
|
|
# HandleTileDmaDstConfig
|
|
# 0x258 -- *(uint *)(this + 0x334) = *(uint *)(this + 0x334) & 0xfffffc3f | 0xc0;
|
|
# (GetCacheHintRegisterValue & 0xf) << 6;
|
|
("u32", 0x25C, "OutputOffset"), # offset into output buffer to write at?
|
|
|
|
# 0x260 -- *(uint *)(this + 0x33c) = *(uint *)(this + 0x33c) & 0x3f | (int)uVar10 << 6;
|
|
("u32", 0x260, "OutputRowStride"),
|
|
("u32", 0x264, "OutputPlaneStride"),
|
|
("u32", 0x268, "OutputDepthStride"),
|
|
("u32", 0x26C, "OutputBatchStride"),
|
|
|
|
# 0x270 -- *(uint *)(this + 0x34c) = *(uint *)(this + 0x34c) & 0xf0ffffff | 0x1000000;
|
|
# uVar6 = *(uint *)(this + 0x34c) & 0xffffcfcc | 0x2031;
|
|
# (ZinTensorDescriptorDmaInterleave & 0xf) << 0x18;
|
|
("u8", 0x273, "OutputInterleave"), # i also have this at 0x211?
|
|
]
|
|
|
|
ANE_Struct_Dict = {}
|
|
for typ, num, nam in ANE_Struct:
|
|
styp = {"u32": "I", "u16": "H", "u8": "B"}[typ]
|
|
ANE_Struct_Dict[nam] = (styp, num)
|
|
|
|
class ANETensor:
|
|
def __init__(self, *shape):
|
|
self.shape = shape
|
|
self.dtype = np.float16
|
|
self.sz = int(np.prod(shape))
|
|
assert(self.sz <= 0x4000)
|
|
self.tt = libane.ANE_TensorCreate(self.sz, 1)
|
|
assert(self.tt is not None)
|
|
|
|
def data(self):
|
|
data = libane.ANE_TensorData(self.tt)
|
|
assert(data is not None)
|
|
#print(hex(addressof(data.contents)))
|
|
buf = np.ctypeslib.as_array(data, shape=(self.sz,))
|
|
ret = np.frombuffer(buf, dtype=self.dtype)
|
|
#print(ret.data)
|
|
return ret
|
|
|
|
class ANE:
|
|
def __init__(self):
|
|
init_libane()
|
|
libane.ANE_Open()
|
|
|
|
def compile(self, dat):
|
|
ret = libane.ANE_Compile(create_string_buffer(dat), len(dat))
|
|
assert(ret is not None)
|
|
return ret
|
|
|
|
def run(self, prog, tin, tout, tweights=None):
|
|
libane.ANE_Run(prog, tin.tt, tout.tt, tweights.tt if tweights is not None else 0)
|
|
|
|
def tensor(self, shape):
|
|
return ANETensor(shape)
|
|
|
|
def unpack(self, dat):
|
|
dat = struct.unpack("Q"*(len(dat)//8), dat)
|
|
ret = {}
|
|
for k,v in aneregs:
|
|
by,bi,sz = v
|
|
bi += (by%8)*8
|
|
by //= 8
|
|
rv = (dat[by] >> bi) & ((1 << sz)-1)
|
|
ret[k] = rv
|
|
return ret
|
|
|
|
def pack(self, pk, dat):
|
|
dat = list(struct.unpack("Q"*(len(dat)//8), dat))
|
|
for k,v in aneregs:
|
|
by,bi,sz = v
|
|
bi += (by%8)*8
|
|
by //= 8
|
|
dat[by] &= ~(((1 << sz)-1) << bi)
|
|
dat[by] |= pk[k] << bi
|
|
dat = struct.pack("Q"*len(dat), *dat)
|
|
return dat
|
|
|
|
def debug(self, dat, mems=0):
|
|
add = [0x30, 0x1d4, 0x220, 0x29c, 0x2f0, 0x30c, 0x32c]
|
|
lens = [244, 60, 108, 68, 12, 16, 24]
|
|
ptr = 0x2b
|
|
ddat = dat[0:0x28]
|
|
for a, pm in zip(add, lens):
|
|
#assert pm == dat[ptr]
|
|
ddat += b"\x00" * (a-len(ddat))
|
|
ddat += dat[ptr+1:ptr+1+pm+4]
|
|
ptr += pm+8
|
|
ddat += b"\x00" * 0x100
|
|
ret = collections.OrderedDict()
|
|
for ln in libane.ANE_RegDebug(0, create_string_buffer(ddat), mems).decode('utf-8').strip().split("\n"):
|
|
lnn = ln.split(" = ")
|
|
if len(lnn) == 2:
|
|
ret[lnn[0]] = int(lnn[1])
|
|
return ret
|
|
|
|
def filln(self, dat, nvdict, base=0x4000):
|
|
for n,v in nvdict.items():
|
|
styp, num = ANE_Struct_Dict[n]
|
|
dat = self.fill(dat, [num], styp, v)
|
|
return dat
|
|
|
|
def fill(self, dat, addrs, type, val, base=0x4000):
|
|
x = struct.pack(type, val)
|
|
for a in addrs:
|
|
dat[base+a:base+a+len(x)] = x
|
|
return dat
|
|
|
|
if __name__ == "__main__":
|
|
ane = ANE()
|
|
|
|
tin = ANETensor(16)
|
|
tout = ANETensor(16)
|
|
|
|
tind = tin.data()
|
|
toutd = tout.data()
|
|
|
|
tind[0:4] = [-1,1,-2,2]
|
|
print("** before **")
|
|
print(tind)
|
|
print(toutd)
|
|
|
|
dat = open("../ops/relu.hwx", "rb").read()
|
|
md = dat[0x4000:0x4300]
|
|
dd = ane.unpack(md)
|
|
mdf = ane.pack(dd, md)
|
|
assert(md == mdf)
|
|
|
|
comp = ane.compile(dat)
|
|
ret = ane.run(comp, tin, tout)
|
|
print("** after **")
|
|
print(tind)
|
|
print(toutd)
|
|
|
|
|