import unittest
from tinygrad import Device, Tensor, dtypes
from tinygrad.helpers import CI
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError

# TODO: write a clean version of this
from test.test_linearizer import helper_linearizer_opt

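# Rough meaning of the OptOps exercised below, as used in these tests (see
# tinygrad.codegen.opt for the authoritative definitions):
#   LOCAL          - split a global axis into a workgroup-local axis
#   GROUP/GROUPTOP - split a reduce axis across the workgroup using shared memory
#   UPCAST         - unroll a global axis into registers (enables float4-style vectorization)
#   UNROLL         - unroll a reduce axis
#   PADTO          - pad an axis up to a multiple, using masked accesses for the padding
#   THREAD         - split a global axis across CPU threads
# As used here, an arg of 0 appears to mean "take the whole axis".
# helper_linearizer_opt roughly: applies each list of Opts to the kernel computing the
# given tensor(s), runs it, and checks the output against the unoptimized kernel.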
class TestKernelOpts(unittest.TestCase):
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
  def test_local_and_grouped_reduce(self):
    N = 128
    Tensor.manual_seed(1882)
    a = Tensor.rand(4, 4, N, N)
    b = Tensor.rand(4, 4, N)
    r = (b.sqrt() + ((a+1).sum(axis=3).exp()))
    helper_linearizer_opt(r, [
      [Opt(OptOps.LOCAL, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 8)],
      [Opt(OptOps.LOCAL, 0, 16)], # Checking how it works with locals
      [Opt(OptOps.GROUPTOP, 0, 2)],
      [Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.GROUPTOP, 0, 64)], # Checking how it works with grouped reduce
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.GROUPTOP, 0, 16)],
      [Opt(OptOps.LOCAL, 0, 32), Opt(OptOps.GROUPTOP, 0, 2)],
      # Checking how it works with locals + grouped reduce
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 64)],
      # Checking how it works with locals + grouped reduce + upcasts
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.UPCAST, 0, 8), Opt(OptOps.UNROLL, 1, 4)],
      # many local + many group
      [Opt(OptOps.GROUP, 0, 2)] * 4,
      [Opt(OptOps.LOCAL, 0, 2)] * 4,
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUP, 0, 2)] * 4,
    ])

  def test_upcasts(self):
    N = 16
    Tensor.manual_seed(1772)
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    r = (a+b).sqrt() * ((a+1).exp())
    helper_linearizer_opt(r, [
      [Opt(OptOps.UPCAST, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 4)],
      [Opt(OptOps.UPCAST, 0, 8)], # Checking how it works with upcasts
    ])

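  # with only 4 elements, a single UPCAST of 4 should consume the whole global axis,
  # leaving no loop at all - hence "full" upcast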
  def test_full_upcast(self):
    Tensor.manual_seed(1772)
    a = Tensor.rand(4)
    b = Tensor.rand(4)
    r = (a+b).sqrt() * ((a+1).exp())
    helper_linearizer_opt(r, [
      [Opt(OptOps.UPCAST, 0, 4)], # Checking how it works with upcasts
    ])

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
  def test_matmul(self):
    N = 128
    Tensor.manual_seed(1552)
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    r = a@b
    helper_linearizer_opt(r, [
      [Opt(OptOps.UPCAST, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # Checking how it works with upcasts
      [Opt(OptOps.LOCAL, 0, 2)],
      [Opt(OptOps.LOCAL, 1, 32)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 32)],
      [Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.LOCAL, 1, 8)], # Checking how it works with locals
      [Opt(OptOps.GROUPTOP, 0, 2)],
      [Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.GROUPTOP, 0, 32), Opt(OptOps.UNROLL, 0, 4)], # Checking how it works with grouped_reduce
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 8), Opt(OptOps.GROUPTOP, 0, 4)], # Checking how it works with local+grouped_reduce
      # Checking all together
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 4),
       Opt(OptOps.UPCAST, 1, 2)],
      # Full global upcast + local
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UPCAST, 0, 8)],
    ])

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
  def test_double_reduce(self):
    N = 128
    Tensor.manual_seed(1552)
    a = Tensor.rand(8, N, 8, N)
    r = a.sum(axis=(1,3))
    helper_linearizer_opt(r, [
      # OpenCL (CL=1) has a max of 256 threads
      [Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)], # Checking how it works with 1 grouped_reduce.
      [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
      [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2)],
      [Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 64)], # Checking how it works with 2 grouped_reduces.
      [Opt(OptOps.GROUPTOP, 0, 16), Opt(OptOps.GROUPTOP, 1, 2), Opt(OptOps.UNROLL, 0, 4)],
      [Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 2, 4)], # Checking how it works with 2 grouped_reduces + upcasts.
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4)],
      # Checking how it works with 2 grouped_reduces + upcasts + locals.
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 32), Opt(OptOps.UNROLL, 1, 4)],
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.LOCAL, 1, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2),
       Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.UNROLL, 1, 4)], # Checking how it works with 2 grouped_reduces + upcasts + locals.
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.LOCAL, 1, 4), Opt(OptOps.GROUPTOP, 0, 4), Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.UPCAST, 0, 2),
       Opt(OptOps.UPCAST, 0, 2)], # No globals
    ])

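  # apply_tc=True asks the helper to try the device's tensor core (WMMA) path before
  # applying the listed opts; tolerances are loose since this matmul accumulates in half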
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
  @unittest.skipUnless(any(tc.dtype_in == tc.dtype_out == dtypes.half for tc in Device[Device.DEFAULT].renderer.tensor_cores),
                       "test requires tensor cores with accumulation in half") # testing with half suffices.
  def test_tensor_core_opts(self):
    N = 128
    Tensor.manual_seed(1552)
    a, b = Tensor.rand(N, N, dtype=dtypes.half), Tensor.rand(N, N, dtype=dtypes.half)
    r = a.matmul(b, dtype=dtypes.half)
    atol, rtol = 0.25, 0.01
    helper_linearizer_opt(r, [
      [],
      [Opt(OptOps.UPCAST, 0, 4)],
      [Opt(OptOps.UPCAST, 1, 4)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4)], # check upcasts
      [Opt(OptOps.UNROLL, 0, 2)], # check unroll
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2)], # check combo of upcast and unroll
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4)],
      [Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UPCAST, 0, 4)], # check permutations
      [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 0, 4)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4)],
      [Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)],
    ], apply_tc=True, atol=atol, rtol=rtol)

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
  @unittest.skipUnless(any(tc.dtype_in == tc.dtype_out == dtypes.half for tc in Device[Device.DEFAULT].renderer.tensor_cores),
                       "test requires tensor cores with accumulation in half") # testing with half suffices.
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  def test_tensor_core_opts_locals(self):
    N = 128
    Tensor.manual_seed(1552)
    a, b = Tensor.rand(N, N, dtype=dtypes.half), Tensor.rand(N, N, dtype=dtypes.half)
    r = a.matmul(b, dtype=dtypes.half)
    atol, rtol = 0.25, 0.01
    helper_linearizer_opt(r, [
      [Opt(OptOps.UNROLL, 0, 0)], # check full unroll of reduce with locals
      [Opt(OptOps.LOCAL, 0, 4)], # check local
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.LOCAL, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.UPCAST, 1, 4), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 0, 4)],
    ], apply_tc=True, atol=atol, rtol=rtol)

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.tensor_cores, "test requires tensor cores")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared memory")
  @unittest.skipUnless(any(tc.dtype_in == tc.dtype_out == dtypes.half for tc in Device[Device.DEFAULT].renderer.tensor_cores),
                       "test requires tensor cores with accumulation in half") # testing with half suffices.
  # NOTE: the METAL test is broken, likely due to a compiler bug. it passes on CI with -O0 and locally on an M3 with the default opt level
  @unittest.skipIf(Device.DEFAULT == "METAL", "broken for METAL")
  @unittest.skip("feature was removed")
  def test_tensor_core_opts_group(self):
    N = 128
    Tensor.manual_seed(1552)
    a, b = Tensor.rand(N, N, dtype=dtypes.half), Tensor.rand(N, N, dtype=dtypes.half)
    r = a.matmul(b, dtype=dtypes.half)
    atol, rtol = 0.25, 0.01
    helper_linearizer_opt(r, [
      [Opt(OptOps.GROUP, 0, 2)],
      [Opt(OptOps.GROUPTOP, 0, 4)],
      [Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.GROUP, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 4), Opt(OptOps.GROUP, 0, 2)],
      [Opt(OptOps.UNROLL, 0, 4), Opt(OptOps.GROUP, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUP, 0, 2)],
      [Opt(OptOps.LOCAL, 0, 2), Opt(OptOps.GROUPTOP, 0, 8), Opt(OptOps.UNROLL, 0, 2), Opt(OptOps.UPCAST, 1, 2)],
    ], apply_tc=True, atol=atol, rtol=rtol)

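  # PADTO pads an axis up to a multiple (32 here) with masked accesses, so kernels over
  # awkward sizes like 17*17 can still be split and vectorized; the result must match
  # the unpadded kernel exactly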
  def test_padto_matmul(self):
    if (CI and Device.DEFAULT in ["AMD", "NV", "CUDA"]):
      self.skipTest("super slow on CUDA and AMD because of the big grid dims")
    N = 17 * 17
    Tensor.manual_seed(289)
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    helper_linearizer_opt(a@b, [
      [Opt(OptOps.PADTO, 0, 32)],
      [Opt(OptOps.PADTO, 1, 32)],
      [Opt(OptOps.PADTO, 2, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.PADTO, 1, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.PADTO, 1, 32), Opt(OptOps.PADTO, 2, 32)],
      # can optimize further post PADTO
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.PADTO, 1, 32), Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.UPCAST, 1, 2),],
    ])

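  # PADTO on an axis that has already been fully upcasted/unrolled (arg 0 takes the
  # whole axis) should be rejected with KernelOptError; each opt is fine on its own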
  def test_padto_upcasted_not_ok(self):
    N = 4
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    helper_linearizer_opt(a@b, [
      [Opt(OptOps.UPCAST, 0, 0)],
      [Opt(OptOps.UPCAST, 1, 0)],
      [Opt(OptOps.UNROLL, 0, 0)],
      [Opt(OptOps.PADTO, 0, 8)],
      [Opt(OptOps.PADTO, 1, 8)],
      [Opt(OptOps.PADTO, 2, 8)],
    ])
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(a@b, [[Opt(OptOps.UPCAST, 0, 0), Opt(OptOps.PADTO, 1, 8)]])
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(a@b, [[Opt(OptOps.UPCAST, 1, 0), Opt(OptOps.PADTO, 1, 8)]])
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(a@b, [[Opt(OptOps.UNROLL, 0, 0), Opt(OptOps.PADTO, 2, 8)]])

  def test_padto_sum_ok(self):
    N = 18 * 18
    # NOTE: this setup prevents the 17 * 17 from being merged into one contiguous dimension
    a = Tensor.rand(N, N).realize().shrink(((0, 17), (0, 17))) * 100
    b = (Tensor.rand(N, N) < 0.5).realize().shrink(((0, 17), (0, 17)))

    helper_linearizer_opt(a.sum(0), [
      [Opt(OptOps.PADTO, 0, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],
    ])
    helper_linearizer_opt(a.sum(1), [
      [Opt(OptOps.PADTO, 0, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],
    ])

    # the sum reduce axis can be padded if there are no unsafe ops before the sum
    for axis in (0, 1):
      helper_linearizer_opt(a.sum(), [[Opt(OptOps.PADTO, axis, 32)],])
      helper_linearizer_opt(a.sum(0), [[Opt(OptOps.PADTO, axis, 32)],])
      helper_linearizer_opt(b.sum(), [[Opt(OptOps.PADTO, axis, 32)],])
      helper_linearizer_opt(b.sum(0), [[Opt(OptOps.PADTO, axis, 32)],])
      helper_linearizer_opt(b.sum(dtype=dtypes.bool), [[Opt(OptOps.PADTO, axis, 32)],])
      # TODO: why?
      if Device.DEFAULT != "WEBGPU":
        helper_linearizer_opt(b.sum(0, dtype=dtypes.bool), [[Opt(OptOps.PADTO, axis, 32)],])
        helper_linearizer_opt(b.sum(1, dtype=dtypes.bool), [[Opt(OptOps.PADTO, axis, 32)],])

    # having unsafe ops after the sum is fine
    helper_linearizer_opt(a.sum().exp(), [[Opt(OptOps.PADTO, 0, 32)],])
    helper_linearizer_opt(a.sum(0).exp(), [[Opt(OptOps.PADTO, 1, 32)],])

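  # PADTO fills the padded region with zeros, so it is only legal when those zeros cannot
  # change the result; e.g. exp(0) == 1 and a comparison like (0 < 1) == True would both
  # leak into a sum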
  def test_padto_sum_not_ok(self):
    N = 18 * 18
    # NOTE: this setup prevents the 17 * 17 from being merged into one contiguous dimension
    a = Tensor.rand(N, N).shrink(((0, 17), (0, 17))).exp()
    # exp is not safe to pad
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(a.exp().sum(), [[Opt(OptOps.PADTO, 0, 32)],])
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(a.exp().sum(0), [[Opt(OptOps.PADTO, 1, 32)],])

    b = a < 1
    # lt is not safe to pad
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(b.sum(), [[Opt(OptOps.PADTO, 0, 32)],])
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(b.sum(0), [[Opt(OptOps.PADTO, 1, 32)],])

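  # all values here are <= 0, so zero padding on the reduce axis would win the max;
  # padding the non-reduce axes stays safe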
  def test_padto_max(self):
    N = 18 * 18
    # NOTE: this setup prevents the 17 * 17 from being merged into one contiguous axis
    a = -Tensor.rand(N, N).shrink(((0, 17), (0, 17))) * 100

    helper_linearizer_opt(a.max(0), [
      [Opt(OptOps.PADTO, 0, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],
    ])
    helper_linearizer_opt(a.max(1), [
      [Opt(OptOps.PADTO, 0, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],
    ])

    # cannot pad the reduce axis of a max kernel
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(a.max(), [[Opt(OptOps.PADTO, 0, 32)],])
    with self.assertRaises(KernelOptError):
      helper_linearizer_opt(a.max(0), [[Opt(OptOps.PADTO, 1, 32)],])

  def test_padto_where(self):
    Tensor.manual_seed(0)
    N = 17 * 17
    a = (Tensor.randn(N, N).realize().max(axis=0, keepdim=True) > 1).where(1, 0)
    helper_linearizer_opt(a.max(0), [
      [Opt(OptOps.PADTO, 0, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],
    ])

  def test_padto_where_multioutput(self):
    Tensor.manual_seed(0)
    N = 17 * 17
    r = Tensor.randn(N, N).realize().max(axis=0, keepdim=True) > 1
    a0 = r.where(1, 0)
    a1 = r.where(2, 0)
    helper_linearizer_opt([a0.max(0), a1.max(0)], [
      [Opt(OptOps.PADTO, 0, 32)],
      [Opt(OptOps.PADTO, 0, 32), Opt(OptOps.UPCAST, 0, 8),],
    ])

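  # color_sizes checks the kernel's colored shape; judging by the expected tuples below,
  # blue = global, cyan = local, green = grouped reduce, red = reduce, magenta = unrolled reduce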
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
  def test_color_shapes_with_local(self):
    N = 32
    Tensor.manual_seed(1552)
    a = Tensor.rand(N, N)
    b = Tensor.rand(N, N)
    r = a@b
    opts_shapes = [
      ([Opt(OptOps.LOCAL, 0, 2)], [("blue",16),("blue",32),("cyan",2),("red",32)]),
      ([Opt(OptOps.LOCAL, 0, 2),Opt(OptOps.GROUP, 0, 2)], [("blue",16),("blue",32),("cyan",2),("green",2),("red",16)]),
      # check to ensure local_dims are stable for full UNROLL of the first reduce
      ([Opt(OptOps.LOCAL, 0, 2),Opt(OptOps.UNROLL, 0, 0)], [("blue",16),("blue",32),("cyan",2),("magenta",32)]),
      ([Opt(OptOps.UNROLL, 0, 0),Opt(OptOps.LOCAL, 0, 2)], [("blue",16),("blue",32),("cyan",2),("magenta",32)]),
      # check behavior for full UNROLL on an existing GROUP
      ([Opt(OptOps.LOCAL, 0, 2),Opt(OptOps.GROUP, 0, 0),Opt(OptOps.UNROLL, 0, 2)], [("blue",16),("blue",32),("cyan",2),("green",16),("magenta",2)]),
      ([Opt(OptOps.LOCAL, 0, 2),Opt(OptOps.GROUP, 0, 0),Opt(OptOps.UNROLL, 0, 0)], [("blue",16),("blue",32),("cyan",2),("magenta",32)]),
      ([Opt(OptOps.GROUP, 0, 0),Opt(OptOps.LOCAL, 0, 2),Opt(OptOps.UNROLL, 0, 0)], [("blue",16),("blue",32),("cyan",2),("magenta",32)]),
      ([Opt(OptOps.GROUP, 0, 2),Opt(OptOps.UNROLL, 0, 0)], [("blue",32),("blue",32),("red",16),("magenta",2)]),
    ]
    helper_linearizer_opt(r, [x[0] for x in opts_shapes], color_sizes=[x[1] for x in opts_shapes])

  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
  def test_arange_opts(self):
    a = Tensor.arange(128)
    # NOTE: arange no longer has reduce ops available for opt
    helper_linearizer_opt(a, [
      #[Opt(OptOps.GROUP, 0, 32)],
      #[Opt(OptOps.GROUPTOP, 0, 32)],
      [Opt(op=OptOps.LOCAL, axis=0, arg=8)],
      [Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0)],
      #[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8)],
      #[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], # noqa: E501
    ])

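  # THREAD splits a global axis across CPU threads (renderers with has_threads); the
  # THREAD=4/8 cases below are only appended when global_max allows that many cores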
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.has_threads, "test requires threads")
  @unittest.skipUnless(Device[Device.DEFAULT].renderer.global_max is not None and
                       Device[Device.DEFAULT].renderer.global_max[0] > 1, "test requires multicore")
  def test_thread_opts(self):
    a = Tensor.rand(4, 4, 4, 4)
    b = Tensor.rand(4, 4, 4)
    r = (b.sqrt() + ((a+1).sum(axis=3).exp()))
    helper_linearizer_opt(r, [
      [Opt(OptOps.THREAD, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.THREAD, 0, 2)],
      [Opt(OptOps.UPCAST, 0, 2), Opt(OptOps.THREAD, 0, 2), Opt(OptOps.UNROLL, 0, 2)],
    ] + [[Opt(OptOps.THREAD, 0, 4)] if Device[Device.DEFAULT].renderer.global_max[0] >= 4 else []]
      + [[Opt(OptOps.THREAD, 0, 8)] if Device[Device.DEFAULT].renderer.global_max[0] >= 8 else []])

if __name__ == '__main__':
  unittest.main()