# Implementation of waifu2x vgg7 in tinygrad.
# Obviously, not developed, supported, etc. by the original waifu2x author(s).

import numpy
from tinygrad.tensor import Tensor
from PIL import Image
from tinygrad.helpers import fetch

# File Formats

# tinygrad convolution tensor input layout is (1,c,y,x) - and therefore the form for all images used in the project
# tinygrad convolution tensor weight layout is (outC,inC,H,W) - this matches NCNN (and therefore KINNE), but not waifu2x json

def image_load(path) -> numpy.ndarray:
  """
  Loads an image in the shape expected by other functions in this module.
  Doesn't Tensor it, in case you need to do further work with it.
  """
  # file
  na = numpy.array(Image.open(path))
  if na.shape[2] == 4:
    # RGBA -> RGB (covers opaque images with alpha channels)
    na = na[:,:,0:3]
  # fix shape
  na = numpy.moveaxis(na, [2,0,1], [0,1,2])
  # shape is now (3,h,w), add 1
  na = na.reshape(1,3,na.shape[1],na.shape[2])
  # change type
  na = na.astype("float32") / 255.0
  return na

def image_save(path, na: numpy.ndarray):
  """
  Saves an image of the shape expected by other functions in this module.
  However, note this expects a numpy array.
  """
  # change type
  na = numpy.fmax(numpy.fmin(na * 255.0, 255), 0).astype("uint8")
  # shape is now (1,3,h,w), remove 1
  na = na.reshape(3,na.shape[2],na.shape[3])
  # fix shape
  na = numpy.moveaxis(na, [0,1,2], [2,0,1])
  # shape is now (h,w,3)
  # file
  Image.fromarray(na).save(path)

# The Model

class Conv3x3Biased:
  """
  A 3x3 convolution layer with some utility functions.
  """
  def __init__(self, inC, outC, last = False):
    # The properties must be named as "W" and "b".
    # This is in an attempt to try and be roughly compatible with https://github.com/FHPythonUtils/Waifu2x
    #  though this cannot necessarily account for transposition and other such things.

    # Massively overstate the weights to get them to be focused on,
    #  since otherwise the biases overrule everything
    self.W = Tensor.uniform(outC, inC, 3, 3) * 16.0
    # Layout-wise, blatant cheat, but serious_mnist does it. I'd guess channels either have to have a size of 1 or whatever the target is?
    # Values-wise, entirely different blatant cheat.
    # In most cases, use uniform bias, but tiny.
    # For the last layer, use just 0.5, constant.
    if last:
      self.b = Tensor.zeros(1, outC, 1, 1) + 0.5
    else:
      self.b = Tensor.uniform(1, outC, 1, 1)

  def forward(self, x):
    # You might be thinking, "but what about padding?"
    # Answer: Tiling is used to stitch everything back together, though you could pad the image before providing it.
    return x.conv2d(self.W).add(self.b)

  def get_parameters(self) -> list:
    return [self.W, self.b]

  def load_waifu2x_json(self, layer: dict):
    # Weights in this file are outChannel,inChannel,X,Y.
    # Not outChannel,inChannel,Y,X.
    # Therefore, transpose it before assignment.
    # I have long since forgotten how I worked this out.
    self.W.assign(Tensor(layer["weight"]).reshape(shape=self.W.shape).transpose(2, 3))
    self.b.assign(Tensor(layer["bias"]).reshape(shape=self.b.shape))
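
# A hedged sketch, not part of the original waifu2x/tinygrad code: a tiny probe showing
#  why Vgg7.forward (below) loses a 14-pixel border. Each unpadded 3x3 conv2d trims one
#  pixel from every edge, so one Conv3x3Biased layer turns (1, c, y, x) into
#  (1, outC, y - 2, x - 2); seven stacked layers lose 2 * 7 = 14 pixels per axis.
# The helper name and shapes here are illustrative only.
def _conv3x3_shrink_probe() -> tuple:
  probe = Conv3x3Biased(3, 4)
  out = probe.forward(Tensor.zeros(1, 3, 32, 32))
  # Expected: (1, 4, 30, 30) - two pixels lost on each spatial axis.
  return out.shape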

class Vgg7:
  """
  The 'vgg7' waifu2x network.
  Lower quality and slower than even upconv7 (nevermind cunet), but is very easy to implement and test.
  """
  def __init__(self):
    self.conv1 = Conv3x3Biased(3, 32)
    self.conv2 = Conv3x3Biased(32, 32)
    self.conv3 = Conv3x3Biased(32, 64)
    self.conv4 = Conv3x3Biased(64, 64)
    self.conv5 = Conv3x3Biased(64, 128)
    self.conv6 = Conv3x3Biased(128, 128)
    self.conv7 = Conv3x3Biased(128, 3, True)

  def forward(self, x):
    """
    Forward pass: Actually runs the network.
    Input format: (1, 3, Y, X)
    Output format: (1, 3, Y - 14, X - 14)
    (the - 14 represents the 7-pixel context border that is lost)
    """
    x = self.conv1.forward(x).leaky_relu(0.1)
    x = self.conv2.forward(x).leaky_relu(0.1)
    x = self.conv3.forward(x).leaky_relu(0.1)
    x = self.conv4.forward(x).leaky_relu(0.1)
    x = self.conv5.forward(x).leaky_relu(0.1)
    x = self.conv6.forward(x).leaky_relu(0.1)
    x = self.conv7.forward(x)
    return x

  def get_parameters(self) -> list:
    return self.conv1.get_parameters() + self.conv2.get_parameters() + self.conv3.get_parameters() + self.conv4.get_parameters() + \
           self.conv5.get_parameters() + self.conv6.get_parameters() + self.conv7.get_parameters()

  def load_from_pretrained(self, intent = "art", subtype = "scale2.0x"):
    """
    Downloads a nagadomi/waifu2x JSON weight file and loads it.
    """
    import json
    data = json.loads(fetch("https://github.com/nagadomi/waifu2x/raw/master/models/vgg_7/" + intent + "/" + subtype + "_model.json").read_bytes())
    self.load_waifu2x_json(data)

  def load_waifu2x_json(self, data: list):
    """
    Loads weights from one of the waifu2x JSON files, e.g. waifu2x/models/vgg_7/art/noise0_model.json
    data (passed in) is assumed to be the output of json.load or something similar on such a file
    """
    self.conv1.load_waifu2x_json(data[0])
    self.conv2.load_waifu2x_json(data[1])
    self.conv3.load_waifu2x_json(data[2])
    self.conv4.load_waifu2x_json(data[3])
    self.conv5.load_waifu2x_json(data[4])
    self.conv6.load_waifu2x_json(data[5])
    self.conv7.load_waifu2x_json(data[6])
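
  # A hedged sketch, not part of the upstream module: the tile-placement arithmetic that
  #  forward_tiled (below) relies on, pulled out so it can be inspected in isolation.
  # Only tile_size - 14 pixels of each input tile survive the network, so output tiles are
  #  laid out on that smaller stride while the tile_size-wide input tiles overlap by 14 pixels.
  # The method name is illustrative; forward_tiled does not call it.
  def _tile_origins(self, out_h: int, out_w: int, tile_size: int) -> list:
    out_tile_size = tile_size - 14  # 14 = 2 * context (7-pixel border lost per side)
    return [(y, x) for y in range(0, out_h, out_tile_size) for x in range(0, out_w, out_tile_size)]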

  def forward_tiled(self, image: numpy.ndarray, tile_size: int) -> numpy.ndarray:
    """
    Given an ndarray image as loaded by image_load (NOT a tensor), scales it, pads it, splits it up, forwards the pieces, and reconstitutes it.
    Note that you really shouldn't try to run anything not (1, 3, *, *) through this.
    """
    # Constant that only really gets repeated a ton here.
    context = 7
    context2 = context + context

    # Notably, numpy is used here because it makes this fine manipulation a lot simpler.
    # Scaling first - repeat on axis 2 and axis 3 (Y & X)
    image = image.repeat(2, 2).repeat(2, 3)

    # Resulting image buffer. This is made before the input is padded,
    #  since the input has the padded shape right now.
    image_out = numpy.zeros(image.shape)

    # Padding next. Note that this padding is done on the whole image.
    # Padding the tiles would lose critical context, cause seams, etc.
    image = numpy.pad(image, [[0, 0], [0, 0], [context, context], [context, context]], mode = "edge")

    # Now for tiling.
    # The output tile size is the usable output from an input tile (tile_size).
    # As such, the tiles overlap.
    out_tile_size = tile_size - context2
    for out_y in range(0, image_out.shape[2], out_tile_size):
      for out_x in range(0, image_out.shape[3], out_tile_size):
        # Input is sourced from the same coordinates, but some stuff ought to be
        #  noted here for future reference:
        # + out_x/y's equivalent position w/ the padding is out_x + context.
        # + The output, however, is without context. Input needs context.
        # + Therefore, the input rectangle is expanded on all sides by context.
        # + Therefore, the input position has the context subtracted again.
        # + Therefore:
        in_y = out_y
        in_x = out_x
        # not shown: in_w/in_h = tile_size (as opposed to out_tile_size)

        # Extract tile.
        # Note that numpy will auto-crop this at the bottom-right.
        # This will never be a problem, as tiles are specifically chosen within the padded section.
        tile = image[:, :, in_y:in_y + tile_size, in_x:in_x + tile_size]
        # Extracted tile dimensions -> output dimensions
        # This is important because of said cropping, otherwise it'd be interior tile size.
        out_h = tile.shape[2] - context2
        out_w = tile.shape[3] - context2
        # Process tile.
        tile_t = Tensor(tile)
        tile_fwd_t = self.forward(tile_t)
        # Replace tile.
        image_out[:, :, out_y:out_y + out_h, out_x:out_x + out_w] = tile_fwd_t.numpy()

    return image_out
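
# Example usage - a sketch, not part of the upstream example; the file names and the tile size
#  are placeholder assumptions. Guarded so importing this module stays side-effect free.
if __name__ == "__main__":
  vgg7 = Vgg7()
  vgg7.load_from_pretrained()            # fetches the nagadomi/waifu2x "art" scale2.0x JSON weights
  img = image_load("input.png")          # (1, 3, h, w) float32 in [0, 1]
  out = vgg7.forward_tiled(img, 156)     # 156-pixel input tiles -> 142 usable output pixels each
  image_save("output.png", out)          # writes the (1, 3, 2h, 2w) result back as uint8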