commit 1885c9e2ba
99 changed files with 1539 additions and 421 deletions
@@ -1 +1 @@
-Subproject commit 03860ae0b2b8128cae7768e4301d889e627c9275
+Subproject commit 28d458a9af49b38bd0a9052f09fbe927324320fb
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba3fe3e61853cc1434e3e220f40c8e9d1f1b9bab8458196ba3bea6a10b82c6ed
size 72718099
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bda57c1a66944f5a633ecd739a24d62702c717a234f2fdcc499dfa1d61c3c19e
size 73147489
@@ -1 +1 @@
-Subproject commit c3d3c71aa7a37364814f029778070fcb550c7cd3
+Subproject commit 46a942d6790531cf5b94b14266140e43afcfda3e
@@ -1 +1 @@
-Subproject commit f56ebf5b776b677bf12ec772b0223274dd798999
+Subproject commit 51ccb9fbd266796e1bf6ffda8b93c4119ab09ff4
@@ -1 +1 @@
-#define COMMA_VERSION "0.8.13"
+#define COMMA_VERSION "0.8.14"
@@ -0,0 +1,39 @@
#!/usr/bin/env python3
import argparse
import pandas as pd  # pylint: disable=import-error

import cereal.messaging as messaging


if __name__ == "__main__":
  parser = argparse.ArgumentParser(description="Cabana-like table of bits for your terminal",
                                   formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument("addr", type=str, nargs=1)
  parser.add_argument("bus", type=int, default=0, nargs='?')

  args = parser.parse_args()

  addr = int(args.addr[0], 0)
  can = messaging.sub_sock('can', conflate=False, timeout=None)

  print(f"waiting for {hex(addr)} ({addr}) on bus {args.bus}...")

  latest = None
  while True:
    for msg in messaging.drain_sock(can, wait_for_one=True):
      for m in msg.can:
        if m.address == addr and m.src == args.bus:
          latest = m

    if latest is None:
      continue

    rows = []
    for b in latest.dat:
      r = list(bin(b).lstrip('0b').zfill(8))
      r += [hex(b)]
      rows.append(r)

    df = pd.DataFrame(data=rows)
    table = df.to_markdown(tablefmt='grid')
    print(f"\n\n{hex(addr)} ({addr}) on bus {args.bus}\n{table}")
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
import sys
from subprocess import check_output, CalledProcessError
from panda import Panda
from panda.python.uds import UdsClient, MessageTimeoutError, SESSION_TYPE, DTC_GROUP_TYPE

try:
  check_output(["pidof", "boardd"])
  print("boardd is running, please kill openpilot before running this script! (aborted)")
  sys.exit(1)
except CalledProcessError as e:
  if e.returncode != 1:  # 1 == no process found (boardd not running)
    raise e

panda = Panda()
panda.set_safety_mode(Panda.SAFETY_ELM327)
address = 0x7DF  # functional (broadcast) address
uds_client = UdsClient(panda, address, bus=0, debug=False)
print("extended diagnostic session ...")
try:
  uds_client.diagnostic_session_control(SESSION_TYPE.EXTENDED_DIAGNOSTIC)
except MessageTimeoutError:
  pass  # functional address isn't properly handled so a timeout occurs
print("clear diagnostic info ...")
try:
  uds_client.clear_diagnostic_information(DTC_GROUP_TYPE.ALL)
except MessageTimeoutError:
  pass  # functional address isn't properly handled so a timeout occurs
print("")
print("you may need to power cycle your vehicle now")
@@ -0,0 +1,272 @@
    read_only image2d_t input,
#ifndef DEPTHWISE
    short startPackedInputChannel,
    short numPackedInputChannelsForGroup, short totalNumPackedInputChannels,
    // typo required for API compatibility
    short packedOuputChannelOffset, short totalNumPackedOutputChannels,
#else
    short totalNumPackedChannels,
#endif
    read_only image2d_t weights, __constant float *biases,
    short filterSizeX, short filterSizeY,
    write_only image2d_t output,
    short paddingX, short paddingY, short strideX, short strideY,
#ifdef SUPPORT_DILATION
    short dilationX, short dilationY,
#endif
    short neuron, float a, float b, float min_clamp, float max_clamp,
#ifndef DEPTHWISE
    // note: these are not supported
    __constant float *parameters, __constant float *batchNormBiases,
#endif
    short numOutputColumns
#ifdef SUPPORT_ACCUMULATION
    , short doAccumulate, read_only image2d_t accumulator
#endif
) {

#ifndef NUM_OUTPUTS
#define NUM_OUTPUTS 4
#endif

  // init
  const sampler_t smp = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
  short packedOutputChannel = get_global_id(0);
  short startOutputColumn = mul24((short)get_global_id(1), NUM_OUTPUTS);
  short outputRow = get_global_id(2);

#ifdef DEPTHWISE
  short totalNumPackedInputChannels = totalNumPackedChannels;
  short totalNumPackedOutputChannels = totalNumPackedChannels;
  short startPackedInputChannel = packedOutputChannel;
#endif

  short startX = mad24(mad24(startOutputColumn, strideX, -paddingX), totalNumPackedInputChannels, startPackedInputChannel);
  short strideWithChannels = mul24(strideX, totalNumPackedInputChannels);

  float4 outputValues[NUM_OUTPUTS];
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] = (float4)(0, 0, 0, 0);
  }

  int2 inputLocation;
  inputLocation.y = mad24(outputRow, strideY, -paddingY);

  int2 weightLocation;
  weightLocation.x = 0;
  weightLocation.y = packedOutputChannel;

#ifdef DEPTHWISE

#ifdef SUPPORT_DILATION

  // depthwise convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      short dilatedStepX = mul24(totalNumPackedChannels, dilationX);
      inputLocation.x = mad24(rfColumn, dilatedStepX, startX);
      float4 inputValues[4];
      for (short i = 0; i < 4; ++i) {
        inputValues[i] = read_imagef(input, smp, inputLocation);
        inputLocation.x += strideWithChannels;
      }
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    inputLocation.y += dilationY;
  }

#else

  // depthwise unstrided convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    float4 inputValues[4];
    inputLocation.x = startX;
    for (short i = 1; i < 4; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedOutputChannels;
    }
    for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {
      inputValues[0] = inputValues[1];
      inputValues[1] = inputValues[2];
      inputValues[2] = inputValues[3];
      inputValues[3] = read_imagef(input, smp, inputLocation);
      inputLocation.x += totalNumPackedChannels;
      float4 weightValues = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
      outputValues[0] += inputValues[0] * weightValues;
      outputValues[1] += inputValues[1] * weightValues;
      outputValues[2] += inputValues[2] * weightValues;
      outputValues[3] += inputValues[3] * weightValues;
    }
    ++inputLocation.y;
  }

#endif

#elif defined(ONLY_1X1_CONV)

  // 1x1 convolution
  short endPackedInputChannel = startPackedInputChannel + numPackedInputChannelsForGroup;
  for (short packedInputChannel = startPackedInputChannel; packedInputChannel < endPackedInputChannel; ++packedInputChannel) {
    float4 weightValues[4];
    for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
      weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
      ++weightLocation.x;
    }

    inputLocation.x = startX + packedInputChannel;
    float4 inputValues[NUM_OUTPUTS];
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      inputValues[i] = read_imagef(input, smp, inputLocation);
      inputLocation.x += strideWithChannels;
    }

    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      float4 curOutputValues = outputValues[i];
      curOutputValues.x += inputValues[i].x * weightValues[0].x;
      curOutputValues.x += inputValues[i].y * weightValues[0].y;
      curOutputValues.x += inputValues[i].z * weightValues[0].z;
      curOutputValues.x += inputValues[i].w * weightValues[0].w;
      curOutputValues.y += inputValues[i].x * weightValues[1].x;
      curOutputValues.y += inputValues[i].y * weightValues[1].y;
      curOutputValues.y += inputValues[i].z * weightValues[1].z;
      curOutputValues.y += inputValues[i].w * weightValues[1].w;
      curOutputValues.z += inputValues[i].x * weightValues[2].x;
      curOutputValues.z += inputValues[i].y * weightValues[2].y;
      curOutputValues.z += inputValues[i].z * weightValues[2].z;
      curOutputValues.z += inputValues[i].w * weightValues[2].w;
      curOutputValues.w += inputValues[i].x * weightValues[3].x;
      curOutputValues.w += inputValues[i].y * weightValues[3].y;
      curOutputValues.w += inputValues[i].z * weightValues[3].z;
      curOutputValues.w += inputValues[i].w * weightValues[3].w;
      outputValues[i] = curOutputValues;
    }
  }
  packedOutputChannel += packedOuputChannelOffset;

#else

  // normal convolution
  for (short rfRow = 0; rfRow < filterSizeY; ++rfRow) {
    for (short packedInputChannel = 0; packedInputChannel < numPackedInputChannelsForGroup; ++packedInputChannel) {
      short startXForChannel = startX + packedInputChannel;
      for (short rfColumn = 0; rfColumn < filterSizeX; ++rfColumn) {

        float4 weightValues[4];
        for (short outChIdx = 0; outChIdx < 4; ++outChIdx) {
          weightValues[outChIdx] = read_imagef(weights, smp, weightLocation);
          ++weightLocation.x;
        }

#ifdef SUPPORT_DILATION
        short dilatedStepX = mul24(totalNumPackedInputChannels, dilationX);
        inputLocation.x = mad24(rfColumn, dilatedStepX, startXForChannel);
#else
        inputLocation.x = mad24(rfColumn, totalNumPackedInputChannels, startXForChannel);
#endif
        float4 inputValues[NUM_OUTPUTS];
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          inputValues[i] = read_imagef(input, smp, inputLocation);
          inputLocation.x += strideWithChannels;
        }
        for (short i = 0; i < NUM_OUTPUTS; ++i) {
          float4 curOutputValues = outputValues[i];
          curOutputValues.x += inputValues[i].x * weightValues[0].x;
          curOutputValues.x += inputValues[i].y * weightValues[0].y;
          curOutputValues.x += inputValues[i].z * weightValues[0].z;
          curOutputValues.x += inputValues[i].w * weightValues[0].w;
          curOutputValues.y += inputValues[i].x * weightValues[1].x;
          curOutputValues.y += inputValues[i].y * weightValues[1].y;
          curOutputValues.y += inputValues[i].z * weightValues[1].z;
          curOutputValues.y += inputValues[i].w * weightValues[1].w;
          curOutputValues.z += inputValues[i].x * weightValues[2].x;
          curOutputValues.z += inputValues[i].y * weightValues[2].y;
          curOutputValues.z += inputValues[i].z * weightValues[2].z;
          curOutputValues.z += inputValues[i].w * weightValues[2].w;
          curOutputValues.w += inputValues[i].x * weightValues[3].x;
          curOutputValues.w += inputValues[i].y * weightValues[3].y;
          curOutputValues.w += inputValues[i].z * weightValues[3].z;
          curOutputValues.w += inputValues[i].w * weightValues[3].w;
          outputValues[i] = curOutputValues;
        }
      }
    }
#ifdef SUPPORT_DILATION
    inputLocation.y += dilationY;
#else
    ++inputLocation.y;
#endif
  }
  packedOutputChannel += packedOuputChannelOffset;
#endif

  // bias
  short outputChannel = mul24(packedOutputChannel, 4);
  float4 biasValues = vload4(0, biases + outputChannel);
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputValues[i] += biasValues;
  }

#ifdef SUPPORT_ACCUMULATION
  // accumulate
  if (doAccumulate) {
    int2 outputLocation;
    short outputColumn = startOutputColumn;
    outputLocation.y = outputRow;
    for (short i = 0; i < NUM_OUTPUTS; ++i) {
      outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
      if (outputColumn < numOutputColumns) {
        outputValues[i] += read_imagef(accumulator, smp, outputLocation);
      }
      ++outputColumn;
    }
  }
#endif

  // activation
  switch (neuron) {
    case 1:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f);
      }
      break;
    case 2:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = a * tanh(b * outputValues[i]);
      }
      break;
    case 3:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = native_recip(1.0f + native_exp(-a * outputValues[i] + b));
      }
      break;
    case 4:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], min_clamp);
        outputValues[i] = min(outputValues[i], max_clamp);
      }
      break;
    case 5:
      for (short i = 0; i < NUM_OUTPUTS; ++i) {
        outputValues[i] = max(outputValues[i], 0.0f) + a * (native_exp(min(outputValues[i], 0.0f)) - 1.0f);
      }
      break;
  }

  // output
  int2 outputLocation;
  short outputColumn = startOutputColumn;
  outputLocation.y = outputRow;
  for (short i = 0; i < NUM_OUTPUTS; ++i) {
    outputLocation.x = mad24(outputColumn, totalNumPackedOutputChannels, packedOutputChannel);
    if (outputColumn < numOutputColumns) {
      write_imagef(output, outputLocation, outputValues[i]);
    }
    ++outputColumn;
  }
}
@@ -0,0 +1,4 @@
#define SUPPORT_DILATION

__kernel void convolution_horizontal_reduced_reads(
#include "convolution_.cl"
@@ -0,0 +1,5 @@
#define ONLY_1X1_CONV
#define SUPPORT_ACCUMULATION

__kernel void convolution_horizontal_reduced_reads_1x1(
#include "convolution_.cl"
@@ -0,0 +1,4 @@
#define NUM_OUTPUTS 5

__kernel void convolution_horizontal_reduced_reads_5_outputs(
#include "convolution_.cl"
@@ -0,0 +1,5 @@
#define DEPTHWISE
#define SUPPORT_DILATION

__kernel void convolution_horizontal_reduced_reads_depthwise(
#include "convolution_.cl"
@@ -0,0 +1,4 @@
#define DEPTHWISE

__kernel void convolution_horizontal_reduced_reads_depthwise_stride_1(
#include "convolution_.cl"
@@ -0,0 +1,266 @@
#include <map>
#include <string>
#include <string.h>
#include <assert.h>
#include "thneed.h"

extern map<cl_program, string> g_program_source;

static int is_same_size_image(cl_mem a, cl_mem b) {
  size_t a_width, a_height, a_depth, a_array_size, a_row_pitch, a_slice_pitch;
  clGetImageInfo(a, CL_IMAGE_WIDTH, sizeof(a_width), &a_width, NULL);
  clGetImageInfo(a, CL_IMAGE_HEIGHT, sizeof(a_height), &a_height, NULL);
  clGetImageInfo(a, CL_IMAGE_DEPTH, sizeof(a_depth), &a_depth, NULL);
  clGetImageInfo(a, CL_IMAGE_ARRAY_SIZE, sizeof(a_array_size), &a_array_size, NULL);
  clGetImageInfo(a, CL_IMAGE_ROW_PITCH, sizeof(a_row_pitch), &a_row_pitch, NULL);
  clGetImageInfo(a, CL_IMAGE_SLICE_PITCH, sizeof(a_slice_pitch), &a_slice_pitch, NULL);

  size_t b_width, b_height, b_depth, b_array_size, b_row_pitch, b_slice_pitch;
  clGetImageInfo(b, CL_IMAGE_WIDTH, sizeof(b_width), &b_width, NULL);
  clGetImageInfo(b, CL_IMAGE_HEIGHT, sizeof(b_height), &b_height, NULL);
  clGetImageInfo(b, CL_IMAGE_DEPTH, sizeof(b_depth), &b_depth, NULL);
  clGetImageInfo(b, CL_IMAGE_ARRAY_SIZE, sizeof(b_array_size), &b_array_size, NULL);
  clGetImageInfo(b, CL_IMAGE_ROW_PITCH, sizeof(b_row_pitch), &b_row_pitch, NULL);
  clGetImageInfo(b, CL_IMAGE_SLICE_PITCH, sizeof(b_slice_pitch), &b_slice_pitch, NULL);

  return (a_width == b_width) && (a_height == b_height) &&
         (a_depth == b_depth) && (a_array_size == b_array_size) &&
         (a_row_pitch == b_row_pitch) && (a_slice_pitch == b_slice_pitch);
}

static cl_mem make_image_like(cl_context context, cl_mem val) {
  cl_image_format format;
  size_t width, height, row_pitch;
  clGetImageInfo(val, CL_IMAGE_FORMAT, sizeof(format), &format, NULL);
  assert(format.image_channel_order == CL_RGBA);
  assert(format.image_channel_data_type == CL_HALF_FLOAT);
  clGetImageInfo(val, CL_IMAGE_WIDTH, sizeof(width), &width, NULL);
  clGetImageInfo(val, CL_IMAGE_HEIGHT, sizeof(height), &height, NULL);
  clGetImageInfo(val, CL_IMAGE_ROW_PITCH, sizeof(row_pitch), &row_pitch, NULL);

  cl_image_desc desc = {0};
  desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  desc.image_width = width;
  desc.image_height = height;
  desc.image_row_pitch = row_pitch;

  cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, row_pitch*height, NULL, NULL);
  assert(buf != NULL);
  desc.buffer = buf;

  cl_int err;
  cl_mem tmp = clCreateImage(context, CL_MEM_READ_WRITE, &format, &desc, NULL, &err);
  //printf("got %d for image %zux%zu %zu\n", err, width, height, row_pitch);
  assert(tmp != NULL);

  return tmp;
}

// convolution_horizontal_reduced_reads_1x1 is 66% of the model runtime
// make that faster and the model gets faster

// this cuts ~2 ms off the model runtime right now
int Thneed::optimize() {
  const char *kernel_path = getenv("KERNEL_PATH");
  if (!kernel_path) { kernel_path = "/data/openpilot/selfdrive/modeld/thneed/kernels"; printf("no KERNEL_PATH set, defaulting to %s\n", kernel_path); }

  // load custom kernels
  map<string, cl_program> g_programs;
  for (auto &k : kq) {
    // replace program?
    if (g_programs.find(k->name) == g_programs.end()) {
      char fn[0x100];
      snprintf(fn, sizeof(fn), "%s/%s.cl", kernel_path, k->name.c_str());
      FILE *g = fopen(fn, "rb");
      if (g != NULL) {
        char *src[0x10000];
        const char *srcs[1]; srcs[0] = (const char *)src;
        memset(src, 0, sizeof(src));
        size_t length = fread(src, 1, sizeof(src), g);
        fclose(g);

        printf("building kernel %s\n", k->name.c_str());
        k->program = clCreateProgramWithSource(context, 1, srcs, &length, NULL);
        char options[0x100];
        snprintf(options, sizeof(options)-1, "-I %s", kernel_path);
        int err = clBuildProgram(k->program, 1, &device_id, options, NULL, NULL);

        if (err != 0) {
          printf("got err %d\n", err);
          size_t err_length;
          char buffer[2048];
          clGetProgramBuildInfo(k->program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &err_length);
          buffer[err_length] = '\0';
          printf("%s\n", buffer);
        }
        assert(err == 0);

        // save in cache
        g_programs[k->name] = k->program;
        g_program_source[k->program] = string((char *)src, length);
      } else {
        g_programs[k->name] = NULL;
      }
    } else {
      // cached replacement
      if (g_programs[k->name] != NULL) {
        k->program = g_programs[k->name];
      }
    }

    // hack in accumulator to convolution_horizontal_reduced_reads_1x1
    if (k->name == "convolution_horizontal_reduced_reads_1x1") {
      k->arg_names.push_back("doAccumulate");
      short doAccumulate = 0;
      k->args.push_back(string((char *)&doAccumulate, sizeof(doAccumulate)));
      k->args_size.push_back(2);
      k->arg_names.push_back("accumulator");
      k->args.push_back(k->args[k->get_arg_num("output")]);
      k->args_size.push_back(8);
      k->num_args += 2;
    }

    // assert that parameters + batchNormBiases are not used
    // since they aren't supported in custom replacement kernels
    if (k->name == "convolution_horizontal_reduced_reads_1x1" ||
        k->name == "convolution_horizontal_reduced_reads" ||
        k->name == "convolution_horizontal_reduced_reads_5_outputs") {
      string p1 = k->args[k->get_arg_num("parameters")];
      string p2 = k->args[k->get_arg_num("batchNormBiases")];
      assert(p1.length() == 8 && *((uint64_t*)p1.data()) == 0);
      assert(p2.length() == 8 && *((uint64_t*)p2.data()) == 0);
    }
  }

  // optimizer
  size_t start_size;
  do {
    start_size = kq.size();

    // get optimizations
    map<string, string> replacements;
    for (int i = 0; i < kq.size(); i++) {
      // fusing elementwise_sum + activate_image will save 3 enqueues

      // delete useless copy layers
      // saves ~0.7 ms
      if (kq[i]->name == "concatenation" || kq[i]->name == "flatten") {
        string in = kq[i]->args[kq[i]->get_arg_num("input")];
        string out = kq[i]->args[kq[i]->get_arg_num("output")];
        if (is_same_size_image(*(cl_mem*)in.data(), *(cl_mem*)out.data())) {
          cl_mem tmp = make_image_like(context, *(cl_mem *)in.data());
          replacements[in] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          kq.erase(kq.begin()+i); --i;
        }
      }

      // NOTE: if activations/accumulation are done in the wrong order, this will be wrong

      // fuse activations into convs and fc_Wtx
      // saves ~1.5 ms
      // NOTE: this changes the outputs because of rounding, should be better now!
      if (i != 0 && kq[i]->name == "activate_image") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_5_outputs" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise" ||
            kq[i-1]->name == "convolution_horizontal_reduced_reads_depthwise_stride_1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string in = kq[i]->args[kq[i]->get_arg_num("input")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == in) {
            short neuron = *(int*)kq[i]->args[kq[i]->get_arg_num("neuron")].data();
            assert(neuron <= 5);

            // ELU isn't supported in fc_Wtx
            assert(!(kq[i-1]->name == "fc_Wtx" && neuron == 5));

            kq[i-1]->args[kq[i-1]->get_arg_num("neuron")] = string((char *)&neuron, sizeof(neuron));

            cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
            replacements[in] = string((char *)&tmp, sizeof(tmp));
            replacements[out] = string((char *)&tmp, sizeof(tmp));

            kq.erase(kq.begin()+i); --i;
          }
        }
      }

      // fuse accumulation into convs and fc_Wtx
      if (i != 0 && kq[i]->name == "elementwise_sum") {
        if (kq[i-1]->name == "convolution_horizontal_reduced_reads_1x1" ||
            kq[i-1]->name == "fc_Wtx") {
          string lastout = kq[i-1]->args[kq[i-1]->get_arg_num("output")];
          string a = kq[i]->args[kq[i]->get_arg_num("a")];
          string b = kq[i]->args[kq[i]->get_arg_num("b")];
          string out = kq[i]->args[kq[i]->get_arg_num("output")];

          if (lastout == a) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = b;
          } else if (lastout == b) {
            kq[i-1]->args[kq[i-1]->get_arg_num("accumulator")] = a;
          } else {
            continue;
          }

          cl_mem tmp = make_image_like(context, *(cl_mem *)lastout.data());
          replacements[lastout] = string((char *)&tmp, sizeof(tmp));
          replacements[out] = string((char *)&tmp, sizeof(tmp));

          short doAccumulate = 1;
          kq[i-1]->args[kq[i-1]->get_arg_num("doAccumulate")] = string((char *)&doAccumulate, sizeof(doAccumulate));

          kq.erase(kq.begin()+i); --i;
        }
      }
    }

    // remap inputs and outputs, and clear the kernels
    for (int i = 0; i < kq.size(); i++) {
      kq[i]->kernel = NULL;
      for (int j = 0; j < kq[i]->num_args; j++) {
        if (replacements.find(kq[i]->args[j]) != replacements.end()) {
          kq[i]->args[j] = replacements[kq[i]->args[j]];
        }
      }
    }

    printf("optimize %lu -> %lu\n", start_size, kq.size());
  } while (kq.size() != start_size);

  size_t work_group_size = 0;
  clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(work_group_size), &work_group_size, NULL);
  printf("max work group size %lu\n", work_group_size);

  // local work group optimizer
  for (auto &k : kq) {
    // only do it for convs, since others might share memory
    if (k->name.rfind("convolution_", 0) == 0) {
      int best = -1;
      if (k->local_work_size[0] * k->local_work_size[1] * k->local_work_size[2] < work_group_size/2) {
        uint64_t base_time = k->benchmark();
        uint64_t best_time = base_time;
        for (int i = 0; i < 3; i++) {
          k->local_work_size[i] *= 2;
          uint64_t this_time = k->benchmark();
          if (this_time < best_time) {
            best = i;
            best_time = this_time;
          }
          k->local_work_size[i] /= 2;
        }
        if (best != -1) {
          k->local_work_size[best] *= 2;
          //printf("%s %.2f ms doubled %d to %.2f ms\n", k->name.c_str(), base_time/1e6, best, best_time/1e6);
        }
      }
    }
  }

  return 0;
}
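The fusion passes above work purely by rewriting kernel arguments: when an activate_image (or elementwise_sum) immediately follows a supported conv/fc kernel and consumes its output, the activation's neuron value (or the other summand as accumulator) is written into the preceding kernel's args, the intermediate image is remapped via the replacements map, and the extra enqueue is dropped; the do/while repeats until the queue stops shrinking. A toy Python model of the activation fusion, using a made-up Kernel record rather than the C++ types, and simplified to rewire the conv's output directly instead of allocating a fresh image via make_image_like:

# sketch: simplified model of the activation-fusion pass over the kernel queue
from dataclasses import dataclass, field

@dataclass
class Kernel:
  name: str
  args: dict = field(default_factory=dict)  # arg name -> value; buffers are plain strings here

CONVS = {"convolution_horizontal_reduced_reads_1x1", "fc_Wtx"}  # subset, for illustration

def fuse_activations(kq):
  changed = True
  while changed:            # keep passing until the queue stops shrinking
    changed, out = False, []
    for k in kq:
      if (out and k.name == "activate_image" and out[-1].name in CONVS
          and out[-1].args.get("output") == k.args.get("input")):
        out[-1].args["neuron"] = k.args["neuron"]  # fold the activation into the conv
        out[-1].args["output"] = k.args["output"]  # conv now writes where the activation wrote
        changed = True
      else:
        out.append(k)
    kq = out
  return kq

kq = [Kernel("convolution_horizontal_reduced_reads_1x1", {"output": "buf0", "neuron": 0}),
      Kernel("activate_image", {"input": "buf0", "output": "buf1", "neuron": 1})]
print([k.name for k in fuse_activations(kq)])  # ['convolution_horizontal_reduced_reads_1x1']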
@@ -1 +1 @@
-7d3ad941bc4ba4c923af7a1d7b48544bfc0d3e13
+19720e79b1c5136a882efd689651d9044e2e2007
@@ -1 +1 @@
-0c4da879ace9c1517c2324b35da7ff05a4744dd9
+67c8f283858998b75ac28879e1350a589a968e5d