thneed saves 45% of a core (#1512)
	
		
	
				
					
				
			* thneed runs the model * thneed is doing the hooking * set kernel args * thneeding the bufferS * print the images well * thneeds with better buffers * includes * disasm adreno * parse packets * disasm works * disasm better * more thneeding * much thneeding * much more thneeding * thneed works i think * thneed is patient * thneed works * 7.7% * gpuobj sync * yay, it mallocs now * cleaning it up, Thneed * sync objs and set power * thneed needs inputs and outputs * thneed in modeld * special modeld runs * can't thneed the DSP * test is weird * thneed modeld uses 6.4% CPU * add thneed to release * move to debug * delete some junk from the pr * always track the timestamp * timestamp hacks in thneed * create a new command queue * fix timestamp * pretty much back to what we had, you can't use SNPE with thneed * improve thneed test * disable save log Co-authored-by: Comma Device <device@comma.ai>pull/1518/head
							parent
							
								
									3d75b4d7c0
								
							
						
					
					
						commit
						302d06ee70
					
				
				 15 changed files with 9788 additions and 6 deletions
			
			
		@ -0,0 +1,8 @@ | 
				
			||||
thneed is an SNPE accelerator. I know SNPE is already an accelerator, but sometimes things need to go even faster.. | 
				
			||||
 | 
				
			||||
It runs on the local device, and caches a single model run. Then it replays it, but fast. | 
				
			||||
 | 
				
			||||
thneed slices through abstraction layers like a fish. | 
				
			||||
 | 
				
			||||
You need a thneed. | 
				
			||||
 | 
				
			||||
@ -0,0 +1 @@ | 
				
			||||
_thneed | 
				
			||||
									
										
											File diff suppressed because it is too large
											Load Diff
										
									
								
							
						
									
										
											File diff suppressed because it is too large
											Load Diff
										
									
								
							
						@ -0,0 +1,473 @@ | 
				
			||||
/* Copyright (c) 2002,2007-2015, The Linux Foundation. All rights reserved.
 | 
				
			||||
 * | 
				
			||||
 * This program is free software; you can redistribute it and/or modify | 
				
			||||
 * it under the terms of the GNU General Public License version 2 and | 
				
			||||
 * only version 2 as published by the Free Software Foundation. | 
				
			||||
 * | 
				
			||||
 * This program is distributed in the hope that it will be useful, | 
				
			||||
 * but WITHOUT ANY WARRANTY; without even the implied warranty of | 
				
			||||
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
				
			||||
 * GNU General Public License for more details. | 
				
			||||
 * | 
				
			||||
 */ | 
				
			||||
#ifndef __ADRENO_PM4TYPES_H | 
				
			||||
#define __ADRENO_PM4TYPES_H | 
				
			||||
 | 
				
			||||
//#include "adreno.h"
 | 
				
			||||
 | 
				
			||||
#define CP_PKT_MASK	0xc0000000 | 
				
			||||
 | 
				
			||||
#define CP_TYPE0_PKT	((unsigned int)0 << 30) | 
				
			||||
#define CP_TYPE3_PKT	((unsigned int)3 << 30) | 
				
			||||
#define CP_TYPE4_PKT    ((unsigned int)4 << 28) | 
				
			||||
#define CP_TYPE7_PKT    ((unsigned int)7 << 28) | 
				
			||||
 | 
				
			||||
#define PM4_TYPE4_PKT_SIZE_MAX  128 | 
				
			||||
 | 
				
			||||
/* type3 packets */ | 
				
			||||
 | 
				
			||||
/* Enable preemption flag */ | 
				
			||||
#define CP_PREEMPT_ENABLE 0x1C | 
				
			||||
/* Preemption token command on which preemption occurs */ | 
				
			||||
#define CP_PREEMPT_TOKEN 0x1E | 
				
			||||
/* Bit to set in CP_PREEMPT_TOKEN ordinal for interrupt on preemption */ | 
				
			||||
#define CP_PREEMPT_ORDINAL_INTERRUPT 24 | 
				
			||||
/* copy from ME scratch RAM to a register */ | 
				
			||||
#define CP_SCRATCH_TO_REG      0x4d | 
				
			||||
 | 
				
			||||
/* Copy from REG to ME scratch RAM */ | 
				
			||||
#define CP_REG_TO_SCRATCH      0x4a | 
				
			||||
 | 
				
			||||
/* Wait for memory writes to complete */ | 
				
			||||
#define CP_WAIT_MEM_WRITES     0x12 | 
				
			||||
 | 
				
			||||
/* Conditional execution based on register comparison */ | 
				
			||||
#define CP_COND_REG_EXEC       0x47 | 
				
			||||
 | 
				
			||||
/* Memory to REG copy */ | 
				
			||||
#define CP_MEM_TO_REG          0x42 | 
				
			||||
 | 
				
			||||
/* initialize CP's micro-engine */ | 
				
			||||
#define CP_ME_INIT		0x48 | 
				
			||||
 | 
				
			||||
/* skip N 32-bit words to get to the next packet */ | 
				
			||||
#define CP_NOP			0x10 | 
				
			||||
 | 
				
			||||
/* indirect buffer dispatch.  same as IB, but init is pipelined */ | 
				
			||||
#define CP_INDIRECT_BUFFER_PFD	0x37 | 
				
			||||
 | 
				
			||||
/* wait for the IDLE state of the engine */ | 
				
			||||
#define CP_WAIT_FOR_IDLE	0x26 | 
				
			||||
 | 
				
			||||
/* wait until a register or memory location is a specific value */ | 
				
			||||
#define CP_WAIT_REG_MEM	0x3c | 
				
			||||
 | 
				
			||||
/* wait until a register location is equal to a specific value */ | 
				
			||||
#define CP_WAIT_REG_EQ		0x52 | 
				
			||||
 | 
				
			||||
/* switches SMMU pagetable, used on a5xx only */ | 
				
			||||
#define CP_SMMU_TABLE_UPDATE 0x53 | 
				
			||||
 | 
				
			||||
/* wait until a read completes */ | 
				
			||||
#define CP_WAIT_UNTIL_READ	0x5c | 
				
			||||
 | 
				
			||||
/* wait until all base/size writes from an IB_PFD packet have completed */ | 
				
			||||
#define CP_WAIT_IB_PFD_COMPLETE 0x5d | 
				
			||||
 | 
				
			||||
/* register read/modify/write */ | 
				
			||||
#define CP_REG_RMW		0x21 | 
				
			||||
 | 
				
			||||
/* Set binning configuration registers */ | 
				
			||||
#define CP_SET_BIN_DATA             0x2f | 
				
			||||
 | 
				
			||||
/* reads register in chip and writes to memory */ | 
				
			||||
#define CP_REG_TO_MEM		0x3e | 
				
			||||
 | 
				
			||||
/* write N 32-bit words to memory */ | 
				
			||||
#define CP_MEM_WRITE		0x3d | 
				
			||||
 | 
				
			||||
/* write CP_PROG_COUNTER value to memory */ | 
				
			||||
#define CP_MEM_WRITE_CNTR	0x4f | 
				
			||||
 | 
				
			||||
/* conditional execution of a sequence of packets */ | 
				
			||||
#define CP_COND_EXEC		0x44 | 
				
			||||
 | 
				
			||||
/* conditional write to memory or register */ | 
				
			||||
#define CP_COND_WRITE		0x45 | 
				
			||||
 | 
				
			||||
/* generate an event that creates a write to memory when completed */ | 
				
			||||
#define CP_EVENT_WRITE		0x46 | 
				
			||||
 | 
				
			||||
/* generate a VS|PS_done event */ | 
				
			||||
#define CP_EVENT_WRITE_SHD	0x58 | 
				
			||||
 | 
				
			||||
/* generate a cache flush done event */ | 
				
			||||
#define CP_EVENT_WRITE_CFL	0x59 | 
				
			||||
 | 
				
			||||
/* generate a z_pass done event */ | 
				
			||||
#define CP_EVENT_WRITE_ZPD	0x5b | 
				
			||||
 | 
				
			||||
 | 
				
			||||
/* initiate fetch of index buffer and draw */ | 
				
			||||
#define CP_DRAW_INDX		0x22 | 
				
			||||
 | 
				
			||||
/* draw using supplied indices in packet */ | 
				
			||||
#define CP_DRAW_INDX_2		0x36 | 
				
			||||
 | 
				
			||||
/* initiate fetch of index buffer and binIDs and draw */ | 
				
			||||
#define CP_DRAW_INDX_BIN	0x34 | 
				
			||||
 | 
				
			||||
/* initiate fetch of bin IDs and draw using supplied indices */ | 
				
			||||
#define CP_DRAW_INDX_2_BIN	0x35 | 
				
			||||
 | 
				
			||||
/* New draw packets defined for A4XX */ | 
				
			||||
#define CP_DRAW_INDX_OFFSET	0x38 | 
				
			||||
#define CP_DRAW_INDIRECT	0x28 | 
				
			||||
#define CP_DRAW_INDX_INDIRECT	0x29 | 
				
			||||
#define CP_DRAW_AUTO		0x24 | 
				
			||||
 | 
				
			||||
/* begin/end initiator for viz query extent processing */ | 
				
			||||
#define CP_VIZ_QUERY		0x23 | 
				
			||||
 | 
				
			||||
/* fetch state sub-blocks and initiate shader code DMAs */ | 
				
			||||
#define CP_SET_STATE		0x25 | 
				
			||||
 | 
				
			||||
/* load constant into chip and to memory */ | 
				
			||||
#define CP_SET_CONSTANT	0x2d | 
				
			||||
 | 
				
			||||
/* load sequencer instruction memory (pointer-based) */ | 
				
			||||
#define CP_IM_LOAD		0x27 | 
				
			||||
 | 
				
			||||
/* load sequencer instruction memory (code embedded in packet) */ | 
				
			||||
#define CP_IM_LOAD_IMMEDIATE	0x2b | 
				
			||||
 | 
				
			||||
/* load constants from a location in memory */ | 
				
			||||
#define CP_LOAD_CONSTANT_CONTEXT 0x2e | 
				
			||||
 | 
				
			||||
/* selective invalidation of state pointers */ | 
				
			||||
#define CP_INVALIDATE_STATE	0x3b | 
				
			||||
 | 
				
			||||
 | 
				
			||||
/* dynamically changes shader instruction memory partition */ | 
				
			||||
#define CP_SET_SHADER_BASES	0x4A | 
				
			||||
 | 
				
			||||
/* sets the 64-bit BIN_MASK register in the PFP */ | 
				
			||||
#define CP_SET_BIN_MASK	0x50 | 
				
			||||
 | 
				
			||||
/* sets the 64-bit BIN_SELECT register in the PFP */ | 
				
			||||
#define CP_SET_BIN_SELECT	0x51 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
/* updates the current context, if needed */ | 
				
			||||
#define CP_CONTEXT_UPDATE	0x5e | 
				
			||||
 | 
				
			||||
/* generate interrupt from the command stream */ | 
				
			||||
#define CP_INTERRUPT		0x40 | 
				
			||||
 | 
				
			||||
/* A5XX Enable yield in RB only */ | 
				
			||||
#define CP_YIELD_ENABLE 0x1C | 
				
			||||
 | 
				
			||||
/* Enable/Disable/Defer A5x global preemption model */ | 
				
			||||
#define CP_PREEMPT_ENABLE_GLOBAL    0x69 | 
				
			||||
 | 
				
			||||
/* Enable/Disable A5x local preemption model */ | 
				
			||||
#define CP_PREEMPT_ENABLE_LOCAL     0x6A | 
				
			||||
 | 
				
			||||
/* Yeild token on a5xx similar to CP_PREEMPT on a4xx */ | 
				
			||||
#define CP_CONTEXT_SWITCH_YIELD     0x6B | 
				
			||||
 | 
				
			||||
/* Inform CP about current render mode (needed for a5xx preemption) */ | 
				
			||||
#define CP_SET_RENDER_MODE          0x6C | 
				
			||||
 | 
				
			||||
/* copy sequencer instruction memory to system memory */ | 
				
			||||
#define CP_IM_STORE            0x2c | 
				
			||||
 | 
				
			||||
/* test 2 memory locations to dword values specified */ | 
				
			||||
#define CP_TEST_TWO_MEMS	0x71 | 
				
			||||
 | 
				
			||||
/* Write register, ignoring context state for context sensitive registers */ | 
				
			||||
#define CP_REG_WR_NO_CTXT  0x78 | 
				
			||||
 | 
				
			||||
/*
 | 
				
			||||
 * for A4xx | 
				
			||||
 * Write to register with address that does not fit into type-0 pkt | 
				
			||||
 */ | 
				
			||||
#define CP_WIDE_REG_WRITE           0x74 | 
				
			||||
 | 
				
			||||
 | 
				
			||||
/* PFP waits until the FIFO between the PFP and the ME is empty */ | 
				
			||||
#define CP_WAIT_FOR_ME		0x13 | 
				
			||||
 | 
				
			||||
/* Record the real-time when this packet is processed by PFP */ | 
				
			||||
#define CP_RECORD_PFP_TIMESTAMP	0x11 | 
				
			||||
 | 
				
			||||
#define CP_SET_PROTECTED_MODE  0x5f /* sets the register protection mode */ | 
				
			||||
 | 
				
			||||
/* Used to switch GPU between secure and non-secure modes */ | 
				
			||||
#define CP_SET_SECURE_MODE 0x66 | 
				
			||||
 | 
				
			||||
#define CP_BOOTSTRAP_UCODE  0x6f /* bootstraps microcode */ | 
				
			||||
 | 
				
			||||
/*
 | 
				
			||||
 * for a3xx | 
				
			||||
 */ | 
				
			||||
 | 
				
			||||
#define CP_LOAD_STATE 0x30 /* load high level sequencer command */ | 
				
			||||
 | 
				
			||||
/* Conditionally load a IB based on a flag */ | 
				
			||||
#define CP_COND_INDIRECT_BUFFER_PFE 0x3A /* prefetch enabled */ | 
				
			||||
#define CP_COND_INDIRECT_BUFFER_PFD 0x32 /* prefetch disabled */ | 
				
			||||
 | 
				
			||||
/* Load a buffer with pre-fetch enabled */ | 
				
			||||
#define CP_INDIRECT_BUFFER_PFE 0x3F | 
				
			||||
 | 
				
			||||
#define CP_EXEC_CL 0x31 | 
				
			||||
 | 
				
			||||
/* (A4x) save PM4 stream pointers to execute upon a visible draw */ | 
				
			||||
#define CP_SET_DRAW_STATE 0x43 | 
				
			||||
 | 
				
			||||
#define CP_LOADSTATE_DSTOFFSET_SHIFT 0x00000000 | 
				
			||||
#define CP_LOADSTATE_STATESRC_SHIFT 0x00000010 | 
				
			||||
#define CP_LOADSTATE_STATEBLOCKID_SHIFT 0x00000013 | 
				
			||||
#define CP_LOADSTATE_NUMOFUNITS_SHIFT 0x00000016 | 
				
			||||
#define CP_LOADSTATE_STATETYPE_SHIFT 0x00000000 | 
				
			||||
#define CP_LOADSTATE_EXTSRCADDR_SHIFT 0x00000002 | 
				
			||||
 | 
				
			||||
static inline uint pm4_calc_odd_parity_bit(uint val) | 
				
			||||
{ | 
				
			||||
	return (0x9669 >> (0xf & ((val) ^ | 
				
			||||
	((val) >> 4) ^ ((val) >> 8) ^ ((val) >> 12) ^ | 
				
			||||
	((val) >> 16) ^ ((val) >> 20) ^ ((val) >> 24) ^ | 
				
			||||
	((val) >> 28)))) & 1; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/*
 | 
				
			||||
 * PM4 packet header functions | 
				
			||||
 * For all the packet functions the passed in count should be the size of the | 
				
			||||
 * payload excluding the header | 
				
			||||
 */ | 
				
			||||
static inline uint cp_type0_packet(uint regindx, uint cnt) | 
				
			||||
{ | 
				
			||||
	return CP_TYPE0_PKT | ((cnt-1) << 16) | ((regindx) & 0x7FFF); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
static inline uint cp_type3_packet(uint opcode, uint cnt) | 
				
			||||
{ | 
				
			||||
	return CP_TYPE3_PKT | ((cnt-1) << 16) | (((opcode) & 0xFF) << 8); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
static inline uint cp_type4_packet(uint opcode, uint cnt) | 
				
			||||
{ | 
				
			||||
	return CP_TYPE4_PKT | ((cnt) << 0) | | 
				
			||||
	(pm4_calc_odd_parity_bit(cnt) << 7) | | 
				
			||||
	(((opcode) & 0x3FFFF) << 8) | | 
				
			||||
	((pm4_calc_odd_parity_bit(opcode) << 27)); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
static inline uint cp_type7_packet(uint opcode, uint cnt) | 
				
			||||
{ | 
				
			||||
	return CP_TYPE7_PKT | ((cnt) << 0) | | 
				
			||||
	(pm4_calc_odd_parity_bit(cnt) << 15) | | 
				
			||||
	(((opcode) & 0x7F) << 16) | | 
				
			||||
	((pm4_calc_odd_parity_bit(opcode) << 23)); | 
				
			||||
 | 
				
			||||
} | 
				
			||||
 | 
				
			||||
#define pkt_is_type0(pkt) (((pkt) & 0XC0000000) == CP_TYPE0_PKT) | 
				
			||||
 | 
				
			||||
#define type0_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) | 
				
			||||
#define type0_pkt_offset(pkt) ((pkt) & 0x7FFF) | 
				
			||||
 | 
				
			||||
/*
 | 
				
			||||
 * Check both for the type3 opcode and make sure that the reserved bits [1:7] | 
				
			||||
 * and 15 are 0 | 
				
			||||
 */ | 
				
			||||
 | 
				
			||||
#define pkt_is_type3(pkt) \ | 
				
			||||
	((((pkt) & 0xC0000000) == CP_TYPE3_PKT) && \
 | 
				
			||||
	 (((pkt) & 0x80FE) == 0)) | 
				
			||||
 | 
				
			||||
#define cp_type3_opcode(pkt) (((pkt) >> 8) & 0xFF) | 
				
			||||
#define type3_pkt_size(pkt) ((((pkt) >> 16) & 0x3FFF) + 1) | 
				
			||||
 | 
				
			||||
#define pkt_is_type4(pkt) \ | 
				
			||||
	((((pkt) & 0xF0000000) == CP_TYPE4_PKT) && \
 | 
				
			||||
	 ((((pkt) >> 27) & 0x1) == \
 | 
				
			||||
	 pm4_calc_odd_parity_bit(cp_type4_base_index_one_reg_wr(pkt))) \
 | 
				
			||||
	 && ((((pkt) >> 7) & 0x1) == \
 | 
				
			||||
	 pm4_calc_odd_parity_bit(type4_pkt_size(pkt)))) | 
				
			||||
 | 
				
			||||
#define cp_type4_base_index_one_reg_wr(pkt) (((pkt) >> 8) & 0x7FFFF) | 
				
			||||
#define type4_pkt_size(pkt) ((pkt) & 0x7F) | 
				
			||||
 | 
				
			||||
#define pkt_is_type7(pkt) \ | 
				
			||||
	((((pkt) & 0xF0000000) == CP_TYPE7_PKT) && \
 | 
				
			||||
	 (((pkt) & 0x0F000000) == 0) && \
 | 
				
			||||
	 ((((pkt) >> 23) & 0x1) == \
 | 
				
			||||
	 pm4_calc_odd_parity_bit(cp_type7_opcode(pkt))) \
 | 
				
			||||
	 && ((((pkt) >> 15) & 0x1) == \
 | 
				
			||||
	 pm4_calc_odd_parity_bit(type7_pkt_size(pkt)))) | 
				
			||||
 | 
				
			||||
#define cp_type7_opcode(pkt) (((pkt) >> 16) & 0x7F) | 
				
			||||
#define type7_pkt_size(pkt) ((pkt) & 0x3FFF) | 
				
			||||
 | 
				
			||||
/* dword base address of the GFX decode space */ | 
				
			||||
#define SUBBLOCK_OFFSET(reg) ((unsigned int)((reg) - (0x2000))) | 
				
			||||
 | 
				
			||||
/* gmem command buffer length */ | 
				
			||||
#define CP_REG(reg) ((0x4 << 16) | (SUBBLOCK_OFFSET(reg))) | 
				
			||||
 | 
				
			||||
// add these
 | 
				
			||||
#define ADRENO_GPUREV(x) 530 | 
				
			||||
#define lower_32_bits(n) ((uint32_t)(n)) | 
				
			||||
#define upper_32_bits(n) ((uint32_t)(((n) >> 16) >> 16)) | 
				
			||||
 | 
				
			||||
/* Return true if the hardware uses the legacy (A4XX and older) PM4 format */ | 
				
			||||
#define ADRENO_LEGACY_PM4(_d) (ADRENO_GPUREV(_d) < 500) | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * cp_packet - Generic CP packet to support different opcodes on | 
				
			||||
 * different GPU cores. | 
				
			||||
 * @adreno_dev: The adreno device | 
				
			||||
 * @opcode: Operation for cp packet | 
				
			||||
 * @size: size for cp packet | 
				
			||||
 */ | 
				
			||||
static inline uint cp_packet(struct adreno_device *adreno_dev, | 
				
			||||
				int opcode, uint size) | 
				
			||||
{ | 
				
			||||
	if (ADRENO_LEGACY_PM4(adreno_dev)) | 
				
			||||
		return cp_type3_packet(opcode, size); | 
				
			||||
 | 
				
			||||
	return cp_type7_packet(opcode, size); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * cp_mem_packet - Generic CP memory packet to support different | 
				
			||||
 * opcodes on different GPU cores. | 
				
			||||
 * @adreno_dev: The adreno device | 
				
			||||
 * @opcode: mem operation for cp packet | 
				
			||||
 * @size: size for cp packet | 
				
			||||
 * @num_mem: num of mem access | 
				
			||||
 */ | 
				
			||||
static inline uint cp_mem_packet(struct adreno_device *adreno_dev, | 
				
			||||
				int opcode, uint size, uint num_mem) | 
				
			||||
{ | 
				
			||||
	if (ADRENO_LEGACY_PM4(adreno_dev)) | 
				
			||||
		return cp_type3_packet(opcode, size); | 
				
			||||
 | 
				
			||||
	return cp_type7_packet(opcode, size + num_mem); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/* Return 1 if the command is an indirect buffer of any kind */ | 
				
			||||
static inline int adreno_cmd_is_ib(struct adreno_device *adreno_dev, | 
				
			||||
					unsigned int cmd) | 
				
			||||
{ | 
				
			||||
	return cmd == cp_mem_packet(adreno_dev, | 
				
			||||
			CP_INDIRECT_BUFFER_PFE, 2, 1) || | 
				
			||||
		cmd == cp_mem_packet(adreno_dev, | 
				
			||||
			CP_INDIRECT_BUFFER_PFD, 2, 1) || | 
				
			||||
		cmd == cp_mem_packet(adreno_dev, | 
				
			||||
			CP_COND_INDIRECT_BUFFER_PFE, 2, 1) || | 
				
			||||
		cmd == cp_mem_packet(adreno_dev, | 
				
			||||
			CP_COND_INDIRECT_BUFFER_PFD, 2, 1); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * cp_gpuaddr - Generic function to add 64bit and 32bit gpuaddr | 
				
			||||
 * to pm4 commands | 
				
			||||
 * @adreno_dev: The adreno device | 
				
			||||
 * @cmds: command pointer to add gpuaddr | 
				
			||||
 * @gpuaddr: gpuaddr to add | 
				
			||||
 */ | 
				
			||||
static inline uint cp_gpuaddr(struct adreno_device *adreno_dev, | 
				
			||||
		   uint *cmds, uint64_t gpuaddr) | 
				
			||||
{ | 
				
			||||
	uint *start = cmds; | 
				
			||||
 | 
				
			||||
	if (ADRENO_LEGACY_PM4(adreno_dev)) | 
				
			||||
		*cmds++ = (uint)gpuaddr; | 
				
			||||
	else { | 
				
			||||
		*cmds++ = lower_32_bits(gpuaddr); | 
				
			||||
		*cmds++ = upper_32_bits(gpuaddr); | 
				
			||||
	} | 
				
			||||
	return cmds - start; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * cp_register - Generic function for gpu register operation | 
				
			||||
 * @adreno_dev: The adreno device | 
				
			||||
 * @reg: GPU register | 
				
			||||
 * @size: count for PM4 operation | 
				
			||||
 */ | 
				
			||||
static inline uint cp_register(struct adreno_device *adreno_dev, | 
				
			||||
			unsigned int reg, unsigned int size) | 
				
			||||
{ | 
				
			||||
	if (ADRENO_LEGACY_PM4(adreno_dev)) | 
				
			||||
		return cp_type0_packet(reg, size); | 
				
			||||
 | 
				
			||||
	return cp_type4_packet(reg, size); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * cp_wait_for_me - common function for WAIT_FOR_ME | 
				
			||||
 * @adreno_dev: The adreno device | 
				
			||||
 * @cmds: command pointer to add gpuaddr | 
				
			||||
 */ | 
				
			||||
static inline uint cp_wait_for_me(struct adreno_device *adreno_dev, | 
				
			||||
				uint *cmds) | 
				
			||||
{ | 
				
			||||
	uint *start = cmds; | 
				
			||||
 | 
				
			||||
	if (ADRENO_LEGACY_PM4(adreno_dev)) { | 
				
			||||
		*cmds++ = cp_type3_packet(CP_WAIT_FOR_ME, 1); | 
				
			||||
		*cmds++ = 0; | 
				
			||||
	} else | 
				
			||||
		*cmds++ = cp_type7_packet(CP_WAIT_FOR_ME, 0); | 
				
			||||
 | 
				
			||||
	return cmds - start; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * cp_wait_for_idle - common function for WAIT_FOR_IDLE | 
				
			||||
 * @adreno_dev: The adreno device | 
				
			||||
 * @cmds: command pointer to add gpuaddr | 
				
			||||
 */ | 
				
			||||
static inline uint cp_wait_for_idle(struct adreno_device *adreno_dev, | 
				
			||||
				uint *cmds) | 
				
			||||
{ | 
				
			||||
	uint *start = cmds; | 
				
			||||
 | 
				
			||||
	if (ADRENO_LEGACY_PM4(adreno_dev)) { | 
				
			||||
		*cmds++ = cp_type3_packet(CP_WAIT_FOR_IDLE, 1); | 
				
			||||
		*cmds++ = 0; | 
				
			||||
	} else | 
				
			||||
		*cmds++ = cp_type7_packet(CP_WAIT_FOR_IDLE, 0); | 
				
			||||
 | 
				
			||||
	return cmds - start; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/**
 | 
				
			||||
 * cp_invalidate_state - common function for invalidating cp | 
				
			||||
 * state | 
				
			||||
 * @adreno_dev: The adreno device | 
				
			||||
 * @cmds: command pointer to add gpuaddr | 
				
			||||
 */ | 
				
			||||
static inline uint cp_invalidate_state(struct adreno_device *adreno_dev, | 
				
			||||
				uint *cmds) | 
				
			||||
{ | 
				
			||||
	uint *start = cmds; | 
				
			||||
 | 
				
			||||
	if (ADRENO_GPUREV(adreno_dev) < 500) { | 
				
			||||
		*cmds++ = cp_type3_packet(CP_INVALIDATE_STATE, 1); | 
				
			||||
		*cmds++ = 0x7fff; | 
				
			||||
	} else { | 
				
			||||
		*cmds++ = cp_type7_packet(CP_SET_DRAW_STATE, 3); | 
				
			||||
		*cmds++ = 0x40000; | 
				
			||||
		*cmds++ = 0; | 
				
			||||
		*cmds++ = 0; | 
				
			||||
	} | 
				
			||||
 | 
				
			||||
	return cmds - start; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
#endif	/* __ADRENO_PM4TYPES_H */ | 
				
			||||
@ -0,0 +1,733 @@ | 
				
			||||
#include <sys/types.h> | 
				
			||||
#include "include/msm_kgsl.h" | 
				
			||||
#include <stdio.h> | 
				
			||||
#include <stdlib.h> | 
				
			||||
#include <dlfcn.h> | 
				
			||||
#include <cassert> | 
				
			||||
#include <sys/mman.h> | 
				
			||||
 | 
				
			||||
int run_num = 0; | 
				
			||||
int ioctl_num = 0; | 
				
			||||
 | 
				
			||||
void hexdump(uint32_t *d, int len) { | 
				
			||||
  assert((len%4) == 0); | 
				
			||||
  printf("  dumping %p len 0x%x\n", d, len); | 
				
			||||
  for (int i = 0; i < len/4; i++) { | 
				
			||||
    if (i != 0 && (i%0x10) == 0) printf("\n"); | 
				
			||||
    printf("%8x ", d[i]); | 
				
			||||
  } | 
				
			||||
  printf("\n"); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
void hexdump8(uint8_t *d, int len) { | 
				
			||||
  printf("  dumping %p len 0x%x\n", d, len); | 
				
			||||
  for (int i = 0; i < len; i++) { | 
				
			||||
    if (i != 0 && (i%0x10) == 0) printf("\n"); | 
				
			||||
    printf("%02x ", d[i]); | 
				
			||||
  } | 
				
			||||
  printf("\n"); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
 | 
				
			||||
#include <string> | 
				
			||||
#include <vector> | 
				
			||||
#include <map> | 
				
			||||
using namespace std; | 
				
			||||
 | 
				
			||||
#include "disasm/include/adreno_pm4types.h" | 
				
			||||
 | 
				
			||||
#define REG_A5XX_TPL1_CS_TEX_CONST_LO        0x0000e760 | 
				
			||||
#define REG_A5XX_TPL1_CS_TEX_SAMP_LO         0x0000e75c | 
				
			||||
 | 
				
			||||
class CachedCommand { | 
				
			||||
  public: | 
				
			||||
    CachedCommand(struct kgsl_gpu_command *cmd, int lfd); | 
				
			||||
    void exec(bool wait); | 
				
			||||
  private: | 
				
			||||
    string cmd_0, cmd_1; | 
				
			||||
    int obj_len; | 
				
			||||
    int fd; | 
				
			||||
 | 
				
			||||
    struct kgsl_gpu_command cache; | 
				
			||||
    struct kgsl_command_object cmds[2]; | 
				
			||||
    struct kgsl_command_object objs[1]; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
vector<CachedCommand *> queue_cmds; | 
				
			||||
 | 
				
			||||
void disassemble(uint32_t *src, int len) { | 
				
			||||
  int i = 0; | 
				
			||||
  while (i < len) { | 
				
			||||
		int pktsize; | 
				
			||||
    int pkttype = -1; | 
				
			||||
 | 
				
			||||
		if (pkt_is_type0(src[i])) { | 
				
			||||
      pkttype = 0; | 
				
			||||
			pktsize = type0_pkt_size(src[i]); | 
				
			||||
		} else if (pkt_is_type3(src[i])) { | 
				
			||||
      pkttype = 3; | 
				
			||||
			pktsize = type3_pkt_size(src[i]); | 
				
			||||
		} else if (pkt_is_type4(src[i])) { | 
				
			||||
      pkttype = 4; | 
				
			||||
      pktsize = type4_pkt_size(src[i]); | 
				
			||||
    } else if (pkt_is_type7(src[i])) { | 
				
			||||
      pkttype = 7; | 
				
			||||
      pktsize = type7_pkt_size(src[i]); | 
				
			||||
    } | 
				
			||||
    printf("%3d: type:%d size:%d ", i, pkttype, pktsize); | 
				
			||||
 | 
				
			||||
    if (pkttype == 7) { | 
				
			||||
      printf("op:  %4x ", cp_type7_opcode(src[i])); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    if (pkttype == 4) { | 
				
			||||
      printf("reg: %4x ", cp_type4_base_index_one_reg_wr(src[i])); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    for (int j = 0; j < pktsize+1; j++) { | 
				
			||||
      printf("%8.8X ", src[i+j]); | 
				
			||||
    } | 
				
			||||
    printf("\n"); | 
				
			||||
 | 
				
			||||
    if (pkttype == 7 && cp_type7_opcode(src[i]) == CP_LOAD_STATE) { | 
				
			||||
      // CP_LOAD_STATE4
 | 
				
			||||
      int sz = (src[i+1] & 0xffc00000) >> 22; | 
				
			||||
      uint64_t addr = (uint64_t)(src[i+2] & 0xfffffffc) | ((uint64_t)(src[i+3]) << 32); | 
				
			||||
      hexdump((uint32_t *)addr, sz*4); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_CONST_LO) { | 
				
			||||
      uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32); | 
				
			||||
      hexdump((uint32_t *)addr, 0x40); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    if (pkttype == 4 && cp_type4_base_index_one_reg_wr(src[i]) == REG_A5XX_TPL1_CS_TEX_SAMP_LO) { | 
				
			||||
      uint64_t addr = (uint64_t)(src[i+1] & 0xffffffff) | ((uint64_t)(src[i+2]) << 32); | 
				
			||||
      hexdump((uint32_t *)addr, 0x40); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    if (pkttype == -1) break; | 
				
			||||
    i += (1+pktsize); | 
				
			||||
  } | 
				
			||||
  assert(i == len); | 
				
			||||
 | 
				
			||||
} | 
				
			||||
 | 
				
			||||
int intercept = 1; | 
				
			||||
int prop_num = 0; | 
				
			||||
 | 
				
			||||
extern "C" { | 
				
			||||
 | 
				
			||||
/*void *gsl_memory_alloc_pure(long param_1, long param_2, long *param_3) {
 | 
				
			||||
  void *(*my_gsl_memory_alloc_pure)(long param_1, long param_2, long *param_3); | 
				
			||||
  my_gsl_memory_alloc_pure = reinterpret_cast<decltype(my_gsl_memory_alloc_pure)>(dlsym(RTLD_NEXT, "gsl_memory_alloc_pure")); | 
				
			||||
 | 
				
			||||
  void *ret = my_gsl_memory_alloc_pure(param_1, param_2, param_3); | 
				
			||||
  printf("gsl_memory_alloc_pure: 0x%lx 0x%lx %p = %p\n", param_1, param_2, param_3, ret); | 
				
			||||
  return ret; | 
				
			||||
}*/ | 
				
			||||
 | 
				
			||||
void *mmap64(void *addr, size_t len, int prot, int flags, int fildes, off64_t off) { | 
				
			||||
  void *(*my_mmap64)(void *addr, size_t len, int prot, int flags, int fildes, off64_t off); | 
				
			||||
  my_mmap64 = reinterpret_cast<decltype(my_mmap64)>(dlsym(RTLD_NEXT, "mmap64")); | 
				
			||||
 | 
				
			||||
  void *ret = my_mmap64(addr, len, prot, flags, fildes, off); | 
				
			||||
 | 
				
			||||
  if (fildes == 3) { | 
				
			||||
    printf("mmap64(addr=%p, len=0x%zx, prot=0x%x, flags=0x%x, fildes=%d, off=0x%lx) = %p\n", addr, len, prot, flags, fildes, off, ret); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
 | 
				
			||||
pid_t gettid(void); | 
				
			||||
 | 
				
			||||
#undef ioctl | 
				
			||||
int ioctl(int filedes, unsigned long request, void *argp) { | 
				
			||||
  int (*my_ioctl)(int filedes, unsigned long request, void *argp); | 
				
			||||
  my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl")); | 
				
			||||
  int skip = 0; | 
				
			||||
 | 
				
			||||
if (intercept) { | 
				
			||||
 | 
				
			||||
  int tid = gettid(); | 
				
			||||
 | 
				
			||||
  if (request == IOCTL_KGSL_GPU_COMMAND) { | 
				
			||||
    struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp; | 
				
			||||
    printf("IOCTL_KGSL_GPU_COMMAND(%d): flags: 0x%lx numcmds: %u   numobjs: %u  numsyncs: %u   context_id: %u  timestamp: %u\n", | 
				
			||||
        tid, | 
				
			||||
        cmd->flags, | 
				
			||||
        cmd->numcmds, cmd->numobjs, cmd->numsyncs, | 
				
			||||
        cmd->context_id, cmd->timestamp); | 
				
			||||
 | 
				
			||||
    assert(cmd->numcmds == 2); | 
				
			||||
    assert(cmd->numobjs == 1); | 
				
			||||
    assert(cmd->numsyncs == 0); | 
				
			||||
 | 
				
			||||
    //struct kgsl_command_object *obj = (struct kgsl_command_object *)cmd->cmdlist;
 | 
				
			||||
    //assert(obj[0].size == sizeof(queue_init));
 | 
				
			||||
    //memcpy(queue_init, (void*)obj[0].gpuaddr, sizeof(queue_init));
 | 
				
			||||
    //string qcmd((char*)obj[1].gpuaddr, obj[1].size);
 | 
				
			||||
    if (run_num == 3) { | 
				
			||||
      CachedCommand *ccmd = new CachedCommand(cmd, filedes); | 
				
			||||
      queue_cmds.push_back(ccmd); | 
				
			||||
 | 
				
			||||
      //ccmd->exec();
 | 
				
			||||
 | 
				
			||||
      //skip = 0;
 | 
				
			||||
      //printf("command 0x%lx\n", obj[1].gpuaddr);
 | 
				
			||||
      //disassemble((uint32_t *)qcmd.data(), qcmd.size()/4);
 | 
				
			||||
      //queue_cmds.push_back(qcmd);
 | 
				
			||||
    } | 
				
			||||
    
 | 
				
			||||
    #ifdef DUMP | 
				
			||||
      char tmp[0x100]; | 
				
			||||
      snprintf(tmp, sizeof(tmp), "/tmp/thneed/run_%d_%d", run_num, ioctl_num++); | 
				
			||||
      FILE *f = fopen(tmp, "wb"); | 
				
			||||
    #endif | 
				
			||||
 | 
				
			||||
    // kgsl_cmdbatch_add_cmdlist
 | 
				
			||||
    for (int i = 0; i < cmd->numcmds; i++) { | 
				
			||||
      struct kgsl_command_object *obj = (struct kgsl_command_object *)cmd->cmdlist; | 
				
			||||
      printf("  cmd: %lx %5lx %5lx flags:%3x %d\n", | 
				
			||||
          obj[i].offset, obj[i].gpuaddr, obj[i].size, obj[i].flags, obj[i].id); | 
				
			||||
      //hexdump((uint32_t *)obj[i].gpuaddr, obj[i].size);
 | 
				
			||||
      #ifdef DUMP | 
				
			||||
        fwrite(&obj[i].size, sizeof(obj[i].size), 1, f); | 
				
			||||
        fwrite((void*)obj[i].gpuaddr, obj[i].size, 1, f); | 
				
			||||
      #endif | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    // kgsl_cmdbatch_add_memlist
 | 
				
			||||
    for (int i = 0; i < cmd->numobjs; i++) { | 
				
			||||
      struct kgsl_command_object *obj = (struct kgsl_command_object *)cmd->objlist; | 
				
			||||
      printf("  obj: %lx %5lx %5lx flags:%3x %d\n", | 
				
			||||
          obj[i].offset, obj[i].gpuaddr, obj[i].size, obj[i].flags, obj[i].id); | 
				
			||||
      //hexdump((uint32_t *)obj[i].gpuaddr, obj[i].size);
 | 
				
			||||
 | 
				
			||||
      #ifdef DUMP | 
				
			||||
        fwrite(&obj[i].size, sizeof(obj[i].size), 1, f); | 
				
			||||
        fwrite((void*)obj[i].gpuaddr, obj[i].size, 1, f); | 
				
			||||
      #endif | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    #ifdef DUMP | 
				
			||||
      fclose(f); | 
				
			||||
    #endif | 
				
			||||
 | 
				
			||||
  } else if (request == IOCTL_KGSL_SETPROPERTY) { | 
				
			||||
    struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp; | 
				
			||||
    printf("IOCTL_KGSL_SETPROPERTY(%d): 0x%x\n", tid, prop->type); | 
				
			||||
    hexdump8((uint8_t*)prop->value, prop->sizebytes); | 
				
			||||
    if (prop_num == 1) { printf("SKIPPING\n"); skip = 1; } | 
				
			||||
    if (run_num == 3) prop_num++; | 
				
			||||
    //hexdump((unsigned char*)prop->value, prop->sizebytes);
 | 
				
			||||
  } else if (request == IOCTL_KGSL_GPUOBJ_SYNC) { | 
				
			||||
    struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp; | 
				
			||||
    struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs); | 
				
			||||
 | 
				
			||||
    printf("IOCTL_KGSL_GPUOBJ_SYNC(%d) count:%d ", tid, cmd->count); | 
				
			||||
    for (int i = 0; i < cmd->count; i++) { | 
				
			||||
      printf(" -- offset:0x%lx len:0x%lx id:%d op:%d  ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op); | 
				
			||||
    } | 
				
			||||
    printf("\n"); | 
				
			||||
  } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) { | 
				
			||||
    struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp; | 
				
			||||
    printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(%d): context_id: %d  timestamp: %d  timeout: %d\n", | 
				
			||||
        tid, cmd->context_id, cmd->timestamp, cmd->timeout); | 
				
			||||
  } else if (request == IOCTL_KGSL_GPUOBJ_ALLOC) { | 
				
			||||
    struct kgsl_gpuobj_alloc *cmd = (struct kgsl_gpuobj_alloc *)argp; | 
				
			||||
    printf("IOCTL_KGSL_GPUOBJ_ALLOC: size:0x%lx flags:0x%lx va_len:0x%lx  ", cmd->size, cmd->flags, cmd->va_len); | 
				
			||||
  } else if (request == IOCTL_KGSL_GPUOBJ_FREE) { | 
				
			||||
    //printf("IOCTL_KGSL_GPUOBJ_FREE\n");
 | 
				
			||||
  } else if (filedes == 3) { | 
				
			||||
    printf("ioctl(%d) %lx\n", tid, request); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
} | 
				
			||||
 | 
				
			||||
  int ret; | 
				
			||||
  if (skip) { | 
				
			||||
    ret = 0; | 
				
			||||
  } else { | 
				
			||||
    ret = my_ioctl(filedes, request, argp); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  if (request == IOCTL_KGSL_GPUOBJ_ALLOC) { | 
				
			||||
    struct kgsl_gpuobj_alloc *cmd = (struct kgsl_gpuobj_alloc *)argp; | 
				
			||||
    printf("mmapsize:0x%lx id:%d metadata_len:%x metadata:0x%lx = %d\n", cmd->mmapsize, cmd->id, cmd->metadata_len, cmd->metadata, ret); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
} | 
				
			||||
 | 
				
			||||
#include <CL/cl.h> | 
				
			||||
#include "../runners/snpemodel.h" | 
				
			||||
#include <sys/types.h> | 
				
			||||
#include <time.h> | 
				
			||||
 | 
				
			||||
static inline uint64_t nanos_since_boot() { | 
				
			||||
  struct timespec t; | 
				
			||||
  clock_gettime(CLOCK_BOOTTIME, &t); | 
				
			||||
  return t.tv_sec * 1000000000ULL + t.tv_nsec; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
int global_timestamp = -1; | 
				
			||||
CachedCommand::CachedCommand(struct kgsl_gpu_command *cmd, int lfd) { | 
				
			||||
  fd = lfd; | 
				
			||||
  assert(cmd->numcmds == 2); | 
				
			||||
  assert(cmd->numobjs == 1); | 
				
			||||
  assert(cmd->numsyncs == 0); | 
				
			||||
 | 
				
			||||
  global_timestamp = cmd->timestamp; | 
				
			||||
 | 
				
			||||
  printf("%p  %p %p\n", cmd, (void*)cmd->cmdlist, (void*)cmd->objlist); | 
				
			||||
 | 
				
			||||
  memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2); | 
				
			||||
  memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1); | 
				
			||||
  cmd_0.assign((char*)cmds[0].gpuaddr, cmds[0].size); | 
				
			||||
  cmd_1.assign((char*)cmds[1].gpuaddr, cmds[1].size); | 
				
			||||
 | 
				
			||||
 | 
				
			||||
  memcpy(&cache, cmd, sizeof(cache)); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
// i think you get these with cl_a5x_ringbuffer_alloc
 | 
				
			||||
uint64_t base = 0; | 
				
			||||
 | 
				
			||||
void CachedCommand::exec(bool wait) { | 
				
			||||
  printf("old addr 0x%lx ", cmds[1].gpuaddr); | 
				
			||||
  cmds[1].gpuaddr = base; | 
				
			||||
  printf("using addr 0x%lx with size 0x%4lx ", cmds[1].gpuaddr, cmd_1.size()); | 
				
			||||
  base += (cmd_1.size()+0xff) & (~0xFF); | 
				
			||||
  memcpy((void*)cmds[1].gpuaddr, cmd_1.data(), cmd_1.size()); | 
				
			||||
 | 
				
			||||
  // set up other buffers
 | 
				
			||||
  memcpy((void*)cmds[0].gpuaddr, cmd_0.data(), cmd_0.size()); | 
				
			||||
  memset((void*)objs[0].gpuaddr, 0, objs[0].size); | 
				
			||||
 | 
				
			||||
  cache.timestamp = ++global_timestamp; | 
				
			||||
  cache.cmdlist = (uint64_t)cmds; | 
				
			||||
  cache.objlist = (uint64_t)objs; | 
				
			||||
 | 
				
			||||
  // run
 | 
				
			||||
  int ret = ioctl(fd, IOCTL_KGSL_GPU_COMMAND, &cache); | 
				
			||||
 | 
				
			||||
  if (wait) { | 
				
			||||
    struct kgsl_device_waittimestamp_ctxtid wait; | 
				
			||||
    wait.context_id = cache.context_id; | 
				
			||||
    wait.timestamp = cache.timestamp; | 
				
			||||
    wait.timeout = -1; | 
				
			||||
 | 
				
			||||
    uint64_t tb = nanos_since_boot(); | 
				
			||||
    int wret = ioctl(fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait); | 
				
			||||
    uint64_t te = nanos_since_boot(); | 
				
			||||
 | 
				
			||||
    printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000); | 
				
			||||
  } else { | 
				
			||||
    printf("CachedCommand::exec got %d\n", ret); | 
				
			||||
  } | 
				
			||||
} | 
				
			||||
 | 
				
			||||
 | 
				
			||||
int do_print = 0; | 
				
			||||
 | 
				
			||||
#define TEMPORAL_SIZE 512 | 
				
			||||
#define DESIRE_LEN 8 | 
				
			||||
#define TRAFFIC_CONVENTION_LEN 2 | 
				
			||||
 | 
				
			||||
FILE *f = NULL; | 
				
			||||
 | 
				
			||||
cl_program clCreateProgramWithSource(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) { | 
				
			||||
  cl_program (*my_clCreateProgramWithSource)(cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret) = NULL; | 
				
			||||
  my_clCreateProgramWithSource = reinterpret_cast<decltype(my_clCreateProgramWithSource)>(dlsym(RTLD_NEXT, "REAL_clCreateProgramWithSource")); | 
				
			||||
  //printf("clCreateProgramWithSource: %d\n", count);
 | 
				
			||||
 | 
				
			||||
  if (f == NULL) { | 
				
			||||
    f = fopen("/tmp/kernels.cl", "w"); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  fprintf(f, "/* ************************ PROGRAM BREAK ****************************/\n"); | 
				
			||||
  for (int i = 0; i < count; i++) { | 
				
			||||
    fprintf(f, "%s\n", strings[i]); | 
				
			||||
    if (i != 0) fprintf(f, "/* ************************ SECTION BREAK ****************************/\n"); | 
				
			||||
  } | 
				
			||||
  fflush(f); | 
				
			||||
 | 
				
			||||
  return my_clCreateProgramWithSource(context, count, strings, lengths, errcode_ret); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
map<cl_kernel, string> kernels; | 
				
			||||
map<cl_kernel, cl_mem> kernel_inputs; | 
				
			||||
map<cl_kernel, cl_mem> kernel_outputs; | 
				
			||||
 | 
				
			||||
cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret) { | 
				
			||||
  cl_kernel (*my_clCreateKernel)(cl_program program, const char *kernel_name, cl_int *errcode_ret) = NULL; | 
				
			||||
  my_clCreateKernel = reinterpret_cast<decltype(my_clCreateKernel)>(dlsym(RTLD_NEXT, "REAL_clCreateKernel")); | 
				
			||||
  cl_kernel ret = my_clCreateKernel(program, kernel_name, errcode_ret); | 
				
			||||
 | 
				
			||||
  printf("clCreateKernel: %s -> %p\n", kernel_name, ret); | 
				
			||||
  kernels.insert(make_pair(ret, kernel_name)); | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
typedef struct image { | 
				
			||||
  size_t image_width; | 
				
			||||
  size_t image_height; | 
				
			||||
  size_t image_row_pitch; | 
				
			||||
  cl_mem buffer; | 
				
			||||
} image; | 
				
			||||
 | 
				
			||||
map<cl_mem, size_t> buffers; | 
				
			||||
map<cl_mem, image> images; | 
				
			||||
 | 
				
			||||
cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { | 
				
			||||
  cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL; | 
				
			||||
  my_clSetKernelArg = reinterpret_cast<decltype(my_clSetKernelArg)>(dlsym(RTLD_NEXT, "REAL_clSetKernelArg")); | 
				
			||||
 | 
				
			||||
  char arg_type[0x100]; | 
				
			||||
  char arg_name[0x100]; | 
				
			||||
  clGetKernelArgInfo(kernel, arg_index, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL); | 
				
			||||
  clGetKernelArgInfo(kernel, arg_index, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); | 
				
			||||
  printf("  %s %s", arg_type, arg_name); | 
				
			||||
 | 
				
			||||
  if (arg_size == 1) { | 
				
			||||
    printf(" = %d", *((char*)arg_value)); | 
				
			||||
  } else if (arg_size == 2) { | 
				
			||||
    printf(" = %d", *((short*)arg_value)); | 
				
			||||
  } else if (arg_size == 4) { | 
				
			||||
    if (strcmp(arg_type, "float") == 0) { | 
				
			||||
      printf(" = %f", *((float*)arg_value)); | 
				
			||||
    } else { | 
				
			||||
      printf(" = %d", *((int*)arg_value)); | 
				
			||||
    } | 
				
			||||
  } else if (arg_size == 8) { | 
				
			||||
    cl_mem val = (cl_mem)(*((uintptr_t*)arg_value)); | 
				
			||||
    printf(" = %p", val); | 
				
			||||
    if (strcmp(arg_name, "input") == 0) kernel_inputs[kernel] = val; | 
				
			||||
    if (strcmp(arg_name, "output") == 0) kernel_outputs[kernel] = val; | 
				
			||||
    if (strcmp(arg_name, "accumulator") == 0) assert(kernel_inputs[kernel] = val); | 
				
			||||
 | 
				
			||||
    if (buffers.find(val) != buffers.end()) { | 
				
			||||
      printf(" buffer %zu", buffers[val]); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    if (images.find(val) != images.end()) { | 
				
			||||
      printf(" image %zu x %zu rp %zu @ %p", images[val].image_width, images[val].image_height, images[val].image_row_pitch, images[val].buffer); | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
  } else { | 
				
			||||
    printf(" %zu", arg_size); | 
				
			||||
  } | 
				
			||||
  printf("\n"); | 
				
			||||
  cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value); | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
uint64_t start_time = 0; | 
				
			||||
uint64_t tns = 0; | 
				
			||||
 | 
				
			||||
int cnt = 0; | 
				
			||||
 | 
				
			||||
cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, | 
				
			||||
  cl_kernel kernel, | 
				
			||||
  cl_uint work_dim, | 
				
			||||
  const size_t *global_work_offset, | 
				
			||||
  const size_t *global_work_size, | 
				
			||||
  const size_t *local_work_size, | 
				
			||||
  cl_uint num_events_in_wait_list, | 
				
			||||
  const cl_event *event_wait_list, | 
				
			||||
  cl_event *event) { | 
				
			||||
 | 
				
			||||
  // SNPE doesn't use these
 | 
				
			||||
  assert(num_events_in_wait_list == 0); | 
				
			||||
  assert(global_work_offset == NULL); | 
				
			||||
 | 
				
			||||
  cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL; | 
				
			||||
  my_clEnqueueNDRangeKernel = reinterpret_cast<decltype(my_clEnqueueNDRangeKernel)>(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel")); | 
				
			||||
 | 
				
			||||
 | 
				
			||||
  uint64_t tb = nanos_since_boot(); | 
				
			||||
  cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, | 
				
			||||
    global_work_offset, global_work_size, local_work_size, | 
				
			||||
    num_events_in_wait_list, event_wait_list, event); | 
				
			||||
  uint64_t te = nanos_since_boot(); | 
				
			||||
 | 
				
			||||
  /*ret = clWaitForEvents(1, event);
 | 
				
			||||
  assert(ret == CL_SUCCESS); | 
				
			||||
  uint64_t tq = nanos_since_boot();*/ | 
				
			||||
 | 
				
			||||
  if (do_print) { | 
				
			||||
    tns += te-tb; | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  printf("%10lu %10lu running(%3d) -- %p -- %56s -- %p -> %p %s ", (tb-start_time)/1000, (tns/1000), cnt++, kernel, kernels[kernel].c_str(), kernel_inputs[kernel], kernel_outputs[kernel], | 
				
			||||
    (buffers[kernel_outputs[kernel]] != 0) ? "B" : "I"); | 
				
			||||
 | 
				
			||||
  printf("global -- "); | 
				
			||||
  for (int i = 0; i < work_dim; i++) { | 
				
			||||
    printf("%4zu ", global_work_size[i]); | 
				
			||||
  } | 
				
			||||
  printf("local -- "); | 
				
			||||
  for (int i = 0; i < work_dim; i++) { | 
				
			||||
    printf("%4zu ", local_work_size[i]); | 
				
			||||
  } | 
				
			||||
  printf("\n"); | 
				
			||||
 | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
 | 
				
			||||
cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) { | 
				
			||||
  cl_mem (*my_clCreateBuffer)(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret) = NULL; | 
				
			||||
  my_clCreateBuffer = reinterpret_cast<decltype(my_clCreateBuffer)>(dlsym(RTLD_NEXT, "REAL_clCreateBuffer")); | 
				
			||||
 | 
				
			||||
  cl_mem ret = my_clCreateBuffer(context, flags, size, host_ptr, errcode_ret); | 
				
			||||
  buffers[ret] = size; | 
				
			||||
  printf("%p = clCreateBuffer %zu\n", ret, size); | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
cl_mem clCreateImage(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) { | 
				
			||||
  cl_mem (*my_clCreateImage)(cl_context context, cl_mem_flags flags, const cl_image_format *image_format, const cl_image_desc *image_desc, void *host_ptr, cl_int *errcode_ret) = NULL; | 
				
			||||
  my_clCreateImage = reinterpret_cast<decltype(my_clCreateImage)>(dlsym(RTLD_NEXT, "REAL_clCreateImage")); | 
				
			||||
 | 
				
			||||
  // SNPE only uses this
 | 
				
			||||
  assert(CL_MEM_OBJECT_IMAGE2D == image_desc->image_type); | 
				
			||||
 | 
				
			||||
  // RGBA, HALF FLOAT
 | 
				
			||||
  assert(CL_RGBA == image_format->image_channel_order); | 
				
			||||
  assert(CL_HALF_FLOAT == image_format->image_channel_data_type); | 
				
			||||
 | 
				
			||||
  map<cl_mem_object_type, string> lc = { | 
				
			||||
    {CL_MEM_OBJECT_BUFFER, "CL_MEM_OBJECT_BUFFER"}, | 
				
			||||
    {CL_MEM_OBJECT_IMAGE2D, "CL_MEM_OBJECT_IMAGE2D"},  // all this one
 | 
				
			||||
    {CL_MEM_OBJECT_IMAGE3D, "CL_MEM_OBJECT_IMAGE3D"}, | 
				
			||||
    {CL_MEM_OBJECT_IMAGE2D_ARRAY, "CL_MEM_OBJECT_IMAGE2D_ARRAY"}, | 
				
			||||
    {CL_MEM_OBJECT_IMAGE1D, "CL_MEM_OBJECT_IMAGE1D"}, | 
				
			||||
    {CL_MEM_OBJECT_IMAGE1D_ARRAY, "CL_MEM_OBJECT_IMAGE1D_ARRAY"}, | 
				
			||||
    {CL_MEM_OBJECT_IMAGE1D_BUFFER, "CL_MEM_OBJECT_IMAGE1D_BUFFER"}}; | 
				
			||||
 | 
				
			||||
  assert(image_desc->image_depth == 0); | 
				
			||||
  assert(image_desc->image_array_size == 0); | 
				
			||||
  assert(image_desc->image_slice_pitch == 0); | 
				
			||||
  //assert(image_desc->image_width * image_desc->image_height * 2 == image_desc->image_row_pitch);
 | 
				
			||||
  
 | 
				
			||||
  image img; | 
				
			||||
  img.image_width = image_desc->image_width; | 
				
			||||
  img.image_height = image_desc->image_height; | 
				
			||||
  img.image_row_pitch = image_desc->image_row_pitch; | 
				
			||||
  img.buffer = image_desc->buffer; | 
				
			||||
 | 
				
			||||
  cl_mem ret = my_clCreateImage(context, flags, image_format, image_desc, host_ptr, errcode_ret); | 
				
			||||
  printf("%p = clCreateImage %s -- %p -- %d %d -- %4zu x %4zu x %4zu -- %4zu %4zu %4zu\n", ret, lc[image_desc->image_type].c_str(), | 
				
			||||
    image_desc->buffer, | 
				
			||||
    image_format->image_channel_order, image_format->image_channel_data_type, | 
				
			||||
    image_desc->image_width, image_desc->image_height, image_desc->image_depth, | 
				
			||||
    image_desc->image_array_size, image_desc->image_row_pitch, image_desc->image_slice_pitch | 
				
			||||
  ); | 
				
			||||
  images[ret] = img; | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list) { | 
				
			||||
  cl_int (*my_clWaitForEvents)(cl_uint num_events, const cl_event *event_list); | 
				
			||||
  my_clWaitForEvents = reinterpret_cast<decltype(my_clWaitForEvents)>(dlsym(RTLD_NEXT, "REAL_clWaitForEvents")); | 
				
			||||
  printf("clWaitForEvents\n"); | 
				
			||||
  return my_clWaitForEvents(num_events, event_list); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
cl_int clReleaseEvent(cl_event event) { | 
				
			||||
  cl_int (*my_clReleaseEvent)(cl_event event); | 
				
			||||
  my_clReleaseEvent = reinterpret_cast<decltype(my_clReleaseEvent)>(dlsym(RTLD_NEXT, "REAL_clReleaseEvent")); | 
				
			||||
  printf("clReleaseEvent: %p\n", event); | 
				
			||||
  return my_clReleaseEvent(event); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
/*size_t total = 0;
 | 
				
			||||
 | 
				
			||||
void *calloc(size_t num, size_t size) { | 
				
			||||
  void *(*my_calloc)(size_t num, size_t size); | 
				
			||||
  my_calloc = reinterpret_cast<decltype(my_calloc)>(dlsym(RTLD_NEXT, "REAL_calloc")); | 
				
			||||
 | 
				
			||||
  void *ret = my_calloc(num, size); | 
				
			||||
 | 
				
			||||
  if (do_print) { | 
				
			||||
    total += num*size; | 
				
			||||
    printf("calloc %p -- total:0x%zx -- num:0x%zx size:0x%zx\n", ret, total, num, size); | 
				
			||||
  } | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
void free(void *ptr) { | 
				
			||||
  void (*my_free)(void *ptr); | 
				
			||||
  my_free = reinterpret_cast<decltype(my_free)>(dlsym(RTLD_NEXT, "REAL_free")); | 
				
			||||
 | 
				
			||||
  if (do_print) { | 
				
			||||
    //printf("free: %p\n", ptr);
 | 
				
			||||
  } else { | 
				
			||||
    my_free(ptr); | 
				
			||||
  } | 
				
			||||
}*/ | 
				
			||||
 | 
				
			||||
void *dlsym(void *handle, const char *symbol) { | 
				
			||||
  void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4); | 
				
			||||
  if (memcmp("REAL_", symbol, 5) == 0) { | 
				
			||||
    return my_dlsym(handle, symbol+5); | 
				
			||||
  } else if (strcmp("clCreateProgramWithSource", symbol) == 0) { | 
				
			||||
    return (void*)clCreateProgramWithSource; | 
				
			||||
  } else if (strcmp("clCreateKernel", symbol) == 0) { | 
				
			||||
    return (void*)clCreateKernel; | 
				
			||||
  } else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) { | 
				
			||||
    return (void*)clEnqueueNDRangeKernel; | 
				
			||||
  } else if (strcmp("clSetKernelArg", symbol) == 0) { | 
				
			||||
    return (void*)clSetKernelArg; | 
				
			||||
  } else if (strcmp("clCreateBuffer", symbol) == 0) { | 
				
			||||
    return (void*)clCreateBuffer; | 
				
			||||
  } else if (strcmp("clCreateImage", symbol) == 0) { | 
				
			||||
    return (void*)clCreateImage; | 
				
			||||
  /*} else if (strcmp("clReleaseEvent", symbol) == 0) {
 | 
				
			||||
    return (void*)clReleaseEvent; | 
				
			||||
  } else if (strcmp("clWaitForEvents", symbol) == 0) { | 
				
			||||
    return (void*)clWaitForEvents;*/ | 
				
			||||
  } else { | 
				
			||||
    //printf("dlsym %s\n", symbol);
 | 
				
			||||
    return my_dlsym(handle, symbol); | 
				
			||||
  } | 
				
			||||
} | 
				
			||||
 | 
				
			||||
int main(int argc, char* argv[]) { | 
				
			||||
  int err; | 
				
			||||
  cl_platform_id platform_id = NULL; | 
				
			||||
  cl_device_id device_id = NULL; | 
				
			||||
  cl_uint num_devices; | 
				
			||||
  cl_uint num_platforms; | 
				
			||||
 | 
				
			||||
  start_time = nanos_since_boot(); | 
				
			||||
 | 
				
			||||
  err = clGetPlatformIDs(1, &platform_id, &num_platforms); | 
				
			||||
  assert(err == 0); | 
				
			||||
  err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &num_devices); | 
				
			||||
  assert(err == 0); | 
				
			||||
 | 
				
			||||
  cl_uint tmp; | 
				
			||||
 | 
				
			||||
  // sweet this is 64!
 | 
				
			||||
  err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WRITE_IMAGE_ARGS, sizeof(tmp), &tmp, NULL); | 
				
			||||
  assert(err == 0); | 
				
			||||
  printf("CL_DEVICE_MAX_WRITE_IMAGE_ARGS: %u\n", tmp); | 
				
			||||
 | 
				
			||||
  err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_READ_IMAGE_ARGS, sizeof(tmp), &tmp, NULL); | 
				
			||||
  assert(err == 0); | 
				
			||||
  printf("CL_DEVICE_MAX_READ_IMAGE_ARGS: %u\n", tmp); | 
				
			||||
 | 
				
			||||
  float *output = (float*)calloc(0x10000, sizeof(float)); | 
				
			||||
  SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME); | 
				
			||||
 | 
				
			||||
  float state[TEMPORAL_SIZE]; | 
				
			||||
  mdl.addRecurrent(state, TEMPORAL_SIZE); | 
				
			||||
 | 
				
			||||
  float desire[DESIRE_LEN]; | 
				
			||||
  mdl.addDesire(desire, DESIRE_LEN); | 
				
			||||
 | 
				
			||||
  float traffic_convention[TRAFFIC_CONVENTION_LEN]; | 
				
			||||
  mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN); | 
				
			||||
 | 
				
			||||
  float *input = (float*)calloc(0x1000000, sizeof(float));; | 
				
			||||
  printf("************** execute 1 **************\n"); | 
				
			||||
  printf("%p %p %p %p -> %p\n", input, state, desire, traffic_convention, output); | 
				
			||||
  run_num = 1; ioctl_num = 0; | 
				
			||||
  do_print = 0; | 
				
			||||
  start_time = nanos_since_boot(); | 
				
			||||
  mdl.execute(input, 0); | 
				
			||||
  printf("************** execute 2 **************\n"); | 
				
			||||
  run_num = 2; ioctl_num = 0; | 
				
			||||
  do_print = 0; | 
				
			||||
  mdl.execute(input, 0); | 
				
			||||
  printf("************** execute 3 **************\n"); | 
				
			||||
  run_num = 3; ioctl_num = 0; | 
				
			||||
 | 
				
			||||
  do_print = 1; | 
				
			||||
  start_time = nanos_since_boot(); | 
				
			||||
  mdl.execute(input, 0); | 
				
			||||
  do_print = 0; | 
				
			||||
 | 
				
			||||
  struct kgsl_gpuobj_alloc alloc; | 
				
			||||
  memset(&alloc, 0, sizeof(alloc)); | 
				
			||||
  alloc.size = 0x40000; | 
				
			||||
  alloc.flags = 0x10000a00; | 
				
			||||
  int fd = 3; | 
				
			||||
  int ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc); | 
				
			||||
  void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000); | 
				
			||||
  assert(addr != MAP_FAILED); | 
				
			||||
 | 
				
			||||
  intercept = 0; | 
				
			||||
  while (1) { | 
				
			||||
    printf("************** execute 4 **************\n"); | 
				
			||||
    run_num = 4; | 
				
			||||
    base = (uint64_t)addr; | 
				
			||||
 | 
				
			||||
    uint64_t tb = nanos_since_boot(); | 
				
			||||
    int i = 0; | 
				
			||||
    for (auto it = queue_cmds.begin(); it != queue_cmds.end(); ++it) { | 
				
			||||
      printf("run %2d: ", i++); | 
				
			||||
      //(*it)->exec(i == queue_cmds.size());
 | 
				
			||||
      (*it)->exec(true); | 
				
			||||
    } | 
				
			||||
    uint64_t te = nanos_since_boot(); | 
				
			||||
    printf("model exec in %lu us\n", (te-tb)/1000); | 
				
			||||
 | 
				
			||||
    break; | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  /*FILE *f = fopen("/proc/self/maps", "rb");
 | 
				
			||||
  char maps[0x100000]; | 
				
			||||
  int len = fread(maps, 1, sizeof(maps), f); | 
				
			||||
  maps[len] = '\0'; | 
				
			||||
  fclose(f); | 
				
			||||
  printf("%s\n", maps);*/ | 
				
			||||
  
 | 
				
			||||
  printf("buffers: %lu images: %lu\n", buffers.size(), images.size()); | 
				
			||||
  printf("queues: %lu\n", queue_cmds.size()); | 
				
			||||
 | 
				
			||||
  // IOCTL_KGSL_GPU_COMMAND: flags: 0x11 numcmds: 2   numobjs: 1  numsyncs: 0   context_id: 7  timestamp: 77
 | 
				
			||||
  /*int ts = 100;
 | 
				
			||||
  for (auto it = queue_cmds.begin(); it != queue_cmds.end(); ++it) { | 
				
			||||
    auto qcmd = *it; | 
				
			||||
    //disassemble((uint32_t *)qcmd.data(), qcmd.size()/4);
 | 
				
			||||
 | 
				
			||||
    struct kgsl_command_object cmdlists[2]; | 
				
			||||
    struct kgsl_command_object objlists; | 
				
			||||
    struct kgsl_gpu_command cmd; | 
				
			||||
    uint8_t objs[0xc0]; | 
				
			||||
    memset(objs, 0, 0xc0); | 
				
			||||
 | 
				
			||||
    memset(&cmd, 0, sizeof(cmd)); | 
				
			||||
    memset(&cmdlists, 0, sizeof(struct kgsl_command_object)*2); | 
				
			||||
    memset(&objlists, 0, sizeof(objlists)); | 
				
			||||
 | 
				
			||||
    cmd.flags = 0x11; | 
				
			||||
    cmd.cmdlist = (uint64_t)cmdlists; | 
				
			||||
    cmd.numcmds = 2; | 
				
			||||
    cmd.objlist = (uint64_t)objlists; | 
				
			||||
    cmd.numobjs = 1; | 
				
			||||
    cmd.numsyncs = 0; | 
				
			||||
    cmd.context_id = 7; | 
				
			||||
    cmd.timestamp = ts++; | 
				
			||||
 | 
				
			||||
    cmdlists[0].gpuaddr = (uint64_t)queue_init; | 
				
			||||
    cmdlists[0].size = 0xbc; | 
				
			||||
    cmdlists[0].flags = 1; | 
				
			||||
    cmdlists[1].gpuaddr = (uint64_t)qcmd.data(); | 
				
			||||
    cmdlists[1].size = qcmd.size(); | 
				
			||||
    cmdlists[1].flags = 1; | 
				
			||||
 | 
				
			||||
    objlists.gpuaddr = (uint64_t)objs; | 
				
			||||
    objlists.size = 0xc0; | 
				
			||||
    objlists.flags = 0x18; | 
				
			||||
  }*/ | 
				
			||||
} | 
				
			||||
 | 
				
			||||
@ -0,0 +1,95 @@ | 
				
			||||
#include "../thneed.h" | 
				
			||||
#include "../../runners/snpemodel.h" | 
				
			||||
 | 
				
			||||
#define TEMPORAL_SIZE 512 | 
				
			||||
#define DESIRE_LEN 8 | 
				
			||||
#define TRAFFIC_CONVENTION_LEN 2 | 
				
			||||
 | 
				
			||||
void hexdump(uint32_t *d, int len); | 
				
			||||
 | 
				
			||||
int main(int argc, char* argv[]) { | 
				
			||||
  float *output = (float*)calloc(0x10000, sizeof(float)); | 
				
			||||
  float *golden = (float*)calloc(0x10000, sizeof(float)); | 
				
			||||
  SNPEModel mdl(argv[1], output, 0, USE_GPU_RUNTIME); | 
				
			||||
 | 
				
			||||
  // cmd line test
 | 
				
			||||
  if (argc > 2) { | 
				
			||||
    for (int i = 2; i < argc; i++) { | 
				
			||||
      float *buf[5]; | 
				
			||||
      FILE *f = fopen(argv[i], "rb"); | 
				
			||||
 | 
				
			||||
      size_t sz; | 
				
			||||
      for (int j = 0; j < 5; j++) { | 
				
			||||
        fread(&sz, 1, sizeof(sz), f); | 
				
			||||
        printf("reading %zu\n", sz); | 
				
			||||
        buf[j] = (float*)malloc(sz); | 
				
			||||
        fread(buf[j], 1, sz, f); | 
				
			||||
      } | 
				
			||||
 | 
				
			||||
      if (sz != 9532) continue; | 
				
			||||
 | 
				
			||||
      mdl.addRecurrent(buf[0], TEMPORAL_SIZE); | 
				
			||||
      mdl.addTrafficConvention(buf[1], TRAFFIC_CONVENTION_LEN); | 
				
			||||
      mdl.addDesire(buf[2], DESIRE_LEN); | 
				
			||||
      mdl.execute(buf[3], 0); | 
				
			||||
 | 
				
			||||
      hexdump((uint32_t*)buf[4], 0x100); | 
				
			||||
      hexdump((uint32_t*)output, 0x100); | 
				
			||||
 | 
				
			||||
      for (int j = 0; j < sz/4; j++) { | 
				
			||||
        if (buf[4][j] != output[j]) { | 
				
			||||
          printf("MISMATCH %d real:%f comp:%f\n", j, buf[4][j], output[j]); | 
				
			||||
        } | 
				
			||||
      } | 
				
			||||
    } | 
				
			||||
 | 
				
			||||
    return 0; | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  float state[TEMPORAL_SIZE]; | 
				
			||||
  mdl.addRecurrent(state, TEMPORAL_SIZE); | 
				
			||||
 | 
				
			||||
  float desire[DESIRE_LEN]; | 
				
			||||
  mdl.addDesire(desire, DESIRE_LEN); | 
				
			||||
 | 
				
			||||
  float traffic_convention[TRAFFIC_CONVENTION_LEN]; | 
				
			||||
  mdl.addTrafficConvention(traffic_convention, TRAFFIC_CONVENTION_LEN); | 
				
			||||
 | 
				
			||||
  float *input = (float*)calloc(0x1000000, sizeof(float));; | 
				
			||||
 | 
				
			||||
  // first run
 | 
				
			||||
  printf("************** execute 1 **************\n"); | 
				
			||||
  memset(output, 0, sizeof(output)); | 
				
			||||
  mdl.execute(input, 0); | 
				
			||||
  hexdump((uint32_t *)output, 0x100); | 
				
			||||
  memcpy(golden, output, sizeof(output)); | 
				
			||||
 | 
				
			||||
  // second run
 | 
				
			||||
  printf("************** execute 2 **************\n"); | 
				
			||||
  memset(output, 0, sizeof(output)); | 
				
			||||
  Thneed *t = new Thneed(); | 
				
			||||
  t->record = 3;  // debug print with record
 | 
				
			||||
  mdl.execute(input, 0); | 
				
			||||
  t->stop(); | 
				
			||||
  hexdump((uint32_t *)output, 0x100); | 
				
			||||
  if (memcmp(golden, output, sizeof(output)) != 0) { printf("FAILURE\n"); return -1; } | 
				
			||||
 | 
				
			||||
  // third run
 | 
				
			||||
  printf("************** execute 3 **************\n"); | 
				
			||||
  memset(output, 0, sizeof(output)); | 
				
			||||
  t->record = 2;  // debug print w/o record
 | 
				
			||||
  float *inputs[4] = {state, traffic_convention, desire, input}; | 
				
			||||
  t->execute(inputs, output); | 
				
			||||
  hexdump((uint32_t *)output, 0x100); | 
				
			||||
  if (memcmp(golden, output, sizeof(output)) != 0) { printf("FAILURE\n"); return -1; } | 
				
			||||
 | 
				
			||||
  printf("************** execute 4 **************\n"); | 
				
			||||
  memset(output, 0, sizeof(output)); | 
				
			||||
  //t->record = 2;  // debug print w/o record
 | 
				
			||||
  t->execute(inputs, output); | 
				
			||||
  hexdump((uint32_t *)output, 0x100); | 
				
			||||
  if (memcmp(golden, output, sizeof(output)) != 0) { printf("FAILURE\n"); return -1; } | 
				
			||||
 | 
				
			||||
  printf("************** execute done **************\n"); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
@ -0,0 +1,4 @@ | 
				
			||||
#!/bin/sh | 
				
			||||
export LD_LIBRARY_PATH="/data/openpilot/phonelibs/snpe/aarch64/:$HOME/openpilot/phonelibs/snpe/larch64:$HOME/openpilot/phonelibs/snpe/x86_64-linux-clang:$LD_LIBRARY_PATH" | 
				
			||||
exec ./_thneed $@ | 
				
			||||
 | 
				
			||||
									
										
											File diff suppressed because it is too large
											Load Diff
										
									
								
							
						@ -0,0 +1,363 @@ | 
				
			||||
#include "thneed.h" | 
				
			||||
#include <cassert> | 
				
			||||
#include <sys/mman.h> | 
				
			||||
#include <dlfcn.h> | 
				
			||||
#include <map> | 
				
			||||
#include <string> | 
				
			||||
#include <errno.h> | 
				
			||||
 | 
				
			||||
Thneed *g_thneed = NULL; | 
				
			||||
int g_fd = -1; | 
				
			||||
std::map<std::pair<cl_kernel, int>, std::string> g_args; | 
				
			||||
 | 
				
			||||
static inline uint64_t nanos_since_boot() { | 
				
			||||
  struct timespec t; | 
				
			||||
  clock_gettime(CLOCK_BOOTTIME, &t); | 
				
			||||
  return t.tv_sec * 1000000000ULL + t.tv_nsec; } | 
				
			||||
 | 
				
			||||
void hexdump(uint32_t *d, int len) { | 
				
			||||
  assert((len%4) == 0); | 
				
			||||
  printf("  dumping %p len 0x%x\n", d, len); | 
				
			||||
  for (int i = 0; i < len/4; i++) { | 
				
			||||
    if (i != 0 && (i%0x10) == 0) printf("\n"); | 
				
			||||
    printf("%8x ", d[i]); | 
				
			||||
  } | 
				
			||||
  printf("\n"); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
extern "C" { | 
				
			||||
 | 
				
			||||
int (*my_ioctl)(int filedes, unsigned long request, void *argp) = NULL; | 
				
			||||
#undef ioctl | 
				
			||||
int ioctl(int filedes, unsigned long request, void *argp) { | 
				
			||||
  if (my_ioctl == NULL) my_ioctl = reinterpret_cast<decltype(my_ioctl)>(dlsym(RTLD_NEXT, "ioctl")); | 
				
			||||
  Thneed *thneed = g_thneed; | 
				
			||||
 | 
				
			||||
  // save the fd
 | 
				
			||||
  if (request == IOCTL_KGSL_GPUOBJ_ALLOC) g_fd = filedes; | 
				
			||||
 | 
				
			||||
  if (thneed != NULL) { | 
				
			||||
    if (request == IOCTL_KGSL_GPU_COMMAND) { | 
				
			||||
      struct kgsl_gpu_command *cmd = (struct kgsl_gpu_command *)argp; | 
				
			||||
      if (thneed->record & 1) { | 
				
			||||
        thneed->timestamp = cmd->timestamp; | 
				
			||||
        thneed->context_id = cmd->context_id; | 
				
			||||
        CachedCommand *ccmd = new CachedCommand(thneed, cmd); | 
				
			||||
        thneed->cmds.push_back(ccmd); | 
				
			||||
      } | 
				
			||||
      if (thneed->record & 2) { | 
				
			||||
        printf("IOCTL_KGSL_GPU_COMMAND: flags: 0x%lx    context_id: %u  timestamp: %u\n", | 
				
			||||
            cmd->flags, | 
				
			||||
            cmd->context_id, cmd->timestamp); | 
				
			||||
      } | 
				
			||||
    } else if (request == IOCTL_KGSL_GPUOBJ_SYNC) { | 
				
			||||
      struct kgsl_gpuobj_sync *cmd = (struct kgsl_gpuobj_sync *)argp; | 
				
			||||
      struct kgsl_gpuobj_sync_obj *objs = (struct kgsl_gpuobj_sync_obj *)(cmd->objs); | 
				
			||||
 | 
				
			||||
      if (thneed->record & 2) { | 
				
			||||
        printf("IOCTL_KGSL_GPUOBJ_SYNC count:%d ", cmd->count); | 
				
			||||
        for (int i = 0; i < cmd->count; i++) { | 
				
			||||
          printf(" -- offset:0x%lx len:0x%lx id:%d op:%d  ", objs[i].offset, objs[i].length, objs[i].id, objs[i].op); | 
				
			||||
        } | 
				
			||||
        printf("\n"); | 
				
			||||
      } | 
				
			||||
 | 
				
			||||
      if (thneed->record & 1) { | 
				
			||||
        struct kgsl_gpuobj_sync_obj *new_objs = (struct kgsl_gpuobj_sync_obj *)malloc(sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count); | 
				
			||||
        memcpy(new_objs, objs, sizeof(struct kgsl_gpuobj_sync_obj)*cmd->count); | 
				
			||||
        thneed->syncobjs.push_back(std::make_pair(cmd->count, new_objs)); | 
				
			||||
      } | 
				
			||||
    } else if (request == IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID) { | 
				
			||||
      struct kgsl_device_waittimestamp_ctxtid *cmd = (struct kgsl_device_waittimestamp_ctxtid *)argp; | 
				
			||||
      if (thneed->record & 2) { | 
				
			||||
        printf("IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID: context_id: %d  timestamp: %d  timeout: %d\n", | 
				
			||||
            cmd->context_id, cmd->timestamp, cmd->timeout); | 
				
			||||
      } | 
				
			||||
    } else if (request == IOCTL_KGSL_SETPROPERTY) { | 
				
			||||
      if (thneed->record & 2) { | 
				
			||||
        struct kgsl_device_getproperty *prop = (struct kgsl_device_getproperty *)argp; | 
				
			||||
        printf("IOCTL_KGSL_SETPROPERTY: 0x%x sizebytes:%zu\n", prop->type, prop->sizebytes); | 
				
			||||
        if (thneed->record & 4) { | 
				
			||||
          hexdump((uint32_t *)prop->value, prop->sizebytes); | 
				
			||||
          if (prop->type == KGSL_PROP_PWR_CONSTRAINT) { | 
				
			||||
            struct kgsl_device_constraint *constraint = (struct kgsl_device_constraint *)prop->value; | 
				
			||||
            hexdump((uint32_t *)constraint->data, constraint->size); | 
				
			||||
          } | 
				
			||||
        } | 
				
			||||
      } | 
				
			||||
    } | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  int ret = my_ioctl(filedes, request, argp); | 
				
			||||
  if (ret != 0) printf("ioctl returned %d with errno %d\n", ret, errno); | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
} | 
				
			||||
 | 
				
			||||
GPUMalloc::GPUMalloc(int size, int fd) { | 
				
			||||
  struct kgsl_gpuobj_alloc alloc; | 
				
			||||
  memset(&alloc, 0, sizeof(alloc)); | 
				
			||||
  alloc.size = size; | 
				
			||||
  alloc.flags = 0x10000a00; | 
				
			||||
  int ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_ALLOC, &alloc); | 
				
			||||
  void *addr = mmap64(NULL, alloc.mmapsize, 0x3, 0x1, fd, alloc.id*0x1000); | 
				
			||||
  assert(addr != MAP_FAILED); | 
				
			||||
 | 
				
			||||
  base = (uint64_t)addr; | 
				
			||||
  remaining = size; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
void *GPUMalloc::alloc(int size) { | 
				
			||||
  if (size > remaining) return NULL; | 
				
			||||
  remaining -= size; | 
				
			||||
  void *ret = (void*)base; | 
				
			||||
  base += (size+0xff) & (~0xFF); | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
CachedCommand::CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd) { | 
				
			||||
  thneed = lthneed; | 
				
			||||
  assert(cmd->numcmds == 2); | 
				
			||||
  assert(cmd->numobjs == 1); | 
				
			||||
  assert(cmd->numsyncs == 0); | 
				
			||||
 | 
				
			||||
  memcpy(cmds, (void *)cmd->cmdlist, sizeof(struct kgsl_command_object)*2); | 
				
			||||
  memcpy(objs, (void *)cmd->objlist, sizeof(struct kgsl_command_object)*1); | 
				
			||||
 | 
				
			||||
  memcpy(&cache, cmd, sizeof(cache)); | 
				
			||||
  cache.cmdlist = (uint64_t)cmds; | 
				
			||||
  cache.objlist = (uint64_t)objs; | 
				
			||||
 | 
				
			||||
  for (int i = 0; i < cmd->numcmds; i++) { | 
				
			||||
    void *nn = thneed->ram->alloc(cmds[i].size); | 
				
			||||
    memcpy(nn, (void*)cmds[i].gpuaddr, cmds[i].size); | 
				
			||||
    cmds[i].gpuaddr = (uint64_t)nn; | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  for (int i = 0; i < cmd->numobjs; i++) { | 
				
			||||
    void *nn = thneed->ram->alloc(objs[i].size); | 
				
			||||
    memset(nn, 0, objs[i].size); | 
				
			||||
    objs[i].gpuaddr = (uint64_t)nn; | 
				
			||||
  } | 
				
			||||
} | 
				
			||||
 | 
				
			||||
void CachedCommand::exec(bool wait) { | 
				
			||||
  cache.timestamp = ++thneed->timestamp; | 
				
			||||
  int ret = ioctl(thneed->fd, IOCTL_KGSL_GPU_COMMAND, &cache); | 
				
			||||
 | 
				
			||||
  if (wait) { | 
				
			||||
    struct kgsl_device_waittimestamp_ctxtid wait; | 
				
			||||
    wait.context_id = cache.context_id; | 
				
			||||
    wait.timestamp = cache.timestamp; | 
				
			||||
    wait.timeout = -1; | 
				
			||||
 | 
				
			||||
    uint64_t tb = nanos_since_boot(); | 
				
			||||
    int wret = ioctl(thneed->fd, IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID, &wait); | 
				
			||||
    uint64_t te = nanos_since_boot(); | 
				
			||||
 | 
				
			||||
    if (thneed->record & 2) printf("exec %d wait %d after %lu us\n", ret, wret, (te-tb)/1000); | 
				
			||||
  } else { | 
				
			||||
    if (thneed->record & 2) printf("CachedCommand::exec got %d\n", ret); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  assert(ret == 0); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
Thneed::Thneed() { | 
				
			||||
  assert(g_fd != -1); | 
				
			||||
  fd = g_fd; | 
				
			||||
  ram = new GPUMalloc(0x40000, fd); | 
				
			||||
  record = 1; | 
				
			||||
  timestamp = -1; | 
				
			||||
  g_thneed = this; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
void Thneed::stop() { | 
				
			||||
  record = 0; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
//#define SAVE_LOG
 | 
				
			||||
 | 
				
			||||
void Thneed::execute(float **finputs, float *foutput) { | 
				
			||||
  #ifdef SAVE_LOG | 
				
			||||
    char fn[0x100]; | 
				
			||||
    snprintf(fn, sizeof(fn), "/tmp/thneed_log_%d", timestamp); | 
				
			||||
    FILE *f = fopen(fn, "wb"); | 
				
			||||
  #endif | 
				
			||||
 | 
				
			||||
  // ****** copy inputs
 | 
				
			||||
  for (int idx = 0; idx < inputs.size(); ++idx) { | 
				
			||||
    size_t sz; | 
				
			||||
    clGetMemObjectInfo(inputs[idx], CL_MEM_SIZE, sizeof(sz), &sz, NULL); | 
				
			||||
 | 
				
			||||
    #ifdef SAVE_LOG | 
				
			||||
      fwrite(&sz, 1, sizeof(sz), f); | 
				
			||||
      fwrite(finputs[idx], 1, sz, f); | 
				
			||||
    #endif | 
				
			||||
 | 
				
			||||
    if (record & 2) printf("copying %lu -- %p -> %p\n", sz, finputs[idx], inputs[idx]); | 
				
			||||
    clEnqueueWriteBuffer(command_queue, inputs[idx], CL_TRUE, 0, sz, finputs[idx], 0, NULL, NULL); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  // ****** set power constraint
 | 
				
			||||
  struct kgsl_device_constraint_pwrlevel pwrlevel; | 
				
			||||
  pwrlevel.level = KGSL_CONSTRAINT_PWR_MAX; | 
				
			||||
 | 
				
			||||
  struct kgsl_device_constraint constraint; | 
				
			||||
  constraint.type = KGSL_CONSTRAINT_PWRLEVEL; | 
				
			||||
  constraint.context_id = context_id; | 
				
			||||
  constraint.data = (void*)&pwrlevel; | 
				
			||||
  constraint.size = sizeof(pwrlevel); | 
				
			||||
 | 
				
			||||
  struct kgsl_device_getproperty prop; | 
				
			||||
  prop.type = KGSL_PROP_PWR_CONSTRAINT; | 
				
			||||
  prop.value = (void*)&constraint; | 
				
			||||
  prop.sizebytes = sizeof(constraint); | 
				
			||||
  int ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop); | 
				
			||||
  assert(ret == 0); | 
				
			||||
 | 
				
			||||
  // ****** run commands
 | 
				
			||||
  int i = 0; | 
				
			||||
  for (auto it = cmds.begin(); it != cmds.end(); ++it) { | 
				
			||||
    if (record & 2) printf("run %2d: ", i); | 
				
			||||
    (*it)->exec((++i) == cmds.size()); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  // ****** sync objects
 | 
				
			||||
  for (auto it = syncobjs.begin(); it != syncobjs.end(); ++it) { | 
				
			||||
    struct kgsl_gpuobj_sync cmd; | 
				
			||||
 | 
				
			||||
    cmd.objs = (uint64_t)it->second; | 
				
			||||
    cmd.obj_len = it->first * sizeof(struct kgsl_gpuobj_sync_obj); | 
				
			||||
    cmd.count = it->first; | 
				
			||||
 | 
				
			||||
    ret = ioctl(fd, IOCTL_KGSL_GPUOBJ_SYNC, &cmd); | 
				
			||||
    assert(ret == 0); | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  // ****** copy outputs
 | 
				
			||||
  size_t sz; | 
				
			||||
  clGetMemObjectInfo(output, CL_MEM_SIZE, sizeof(sz), &sz, NULL); | 
				
			||||
  if (record & 2) printf("copying %lu for output %p -> %p\n", sz, output, foutput); | 
				
			||||
  clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, sz, foutput, 0, NULL, NULL); | 
				
			||||
 | 
				
			||||
  #ifdef SAVE_LOG | 
				
			||||
    fwrite(&sz, 1, sizeof(sz), f); | 
				
			||||
    fwrite(foutput, 1, sz, f); | 
				
			||||
    fclose(f); | 
				
			||||
  #endif | 
				
			||||
 | 
				
			||||
  // ****** unset power constraint
 | 
				
			||||
  constraint.type = KGSL_CONSTRAINT_NONE; | 
				
			||||
  constraint.data = NULL; | 
				
			||||
  constraint.size = 0; | 
				
			||||
 | 
				
			||||
  ret = ioctl(fd, IOCTL_KGSL_SETPROPERTY, &prop); | 
				
			||||
  assert(ret == 0); | 
				
			||||
} | 
				
			||||
 | 
				
			||||
cl_int (*my_clSetKernelArg)(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) = NULL; | 
				
			||||
cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { | 
				
			||||
  if (my_clSetKernelArg == NULL) my_clSetKernelArg = reinterpret_cast<decltype(my_clSetKernelArg)>(dlsym(RTLD_NEXT, "REAL_clSetKernelArg")); | 
				
			||||
  if (arg_value != NULL) { | 
				
			||||
    g_args[std::make_pair(kernel, arg_index)] = std::string((char*)arg_value, arg_size); | 
				
			||||
  } | 
				
			||||
  cl_int ret = my_clSetKernelArg(kernel, arg_index, arg_size, arg_value); | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
cl_int (*my_clEnqueueNDRangeKernel)(cl_command_queue, cl_kernel, cl_uint, const size_t *, const size_t *, const size_t *, cl_uint, const cl_event *, cl_event *) = NULL; | 
				
			||||
cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue, | 
				
			||||
  cl_kernel kernel, | 
				
			||||
  cl_uint work_dim, | 
				
			||||
  const size_t *global_work_offset, | 
				
			||||
  const size_t *global_work_size, | 
				
			||||
  const size_t *local_work_size, | 
				
			||||
  cl_uint num_events_in_wait_list, | 
				
			||||
  const cl_event *event_wait_list, | 
				
			||||
  cl_event *event) { | 
				
			||||
 | 
				
			||||
  if (my_clEnqueueNDRangeKernel == NULL) my_clEnqueueNDRangeKernel = reinterpret_cast<decltype(my_clEnqueueNDRangeKernel)>(dlsym(RTLD_NEXT, "REAL_clEnqueueNDRangeKernel")); | 
				
			||||
  Thneed *thneed = g_thneed; | 
				
			||||
 | 
				
			||||
  // SNPE doesn't use these
 | 
				
			||||
  assert(num_events_in_wait_list == 0); | 
				
			||||
  assert(global_work_offset == NULL); | 
				
			||||
 | 
				
			||||
  char name[0x100]; | 
				
			||||
  clGetKernelInfo(kernel, CL_KERNEL_FUNCTION_NAME, sizeof(name), name, NULL); | 
				
			||||
 | 
				
			||||
  cl_uint num_args; | 
				
			||||
  clGetKernelInfo(kernel, CL_KERNEL_NUM_ARGS, sizeof(num_args), &num_args, NULL); | 
				
			||||
 | 
				
			||||
  if (thneed != NULL && thneed->record & 1) { | 
				
			||||
    thneed->command_queue = command_queue; | 
				
			||||
    for (int i = 0; i < num_args; i++) { | 
				
			||||
      char arg_name[0x100]; | 
				
			||||
      clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); | 
				
			||||
      std::string arg = g_args[std::make_pair(kernel, i)]; | 
				
			||||
 | 
				
			||||
      if (strcmp(arg_name, "input") == 0 && strcmp(name, "zero_pad_image_float") == 0) { | 
				
			||||
        cl_mem mem; | 
				
			||||
        memcpy(&mem, (void*)arg.data(), sizeof(mem)); | 
				
			||||
        thneed->inputs.push_back(mem); | 
				
			||||
      } | 
				
			||||
 | 
				
			||||
      if (strcmp(arg_name, "output") == 0 && strcmp(name, "image2d_to_buffer_float") == 0) { | 
				
			||||
        cl_mem mem; | 
				
			||||
        memcpy(&mem, (void*)arg.data(), sizeof(mem)); | 
				
			||||
        thneed->output = mem; | 
				
			||||
      } | 
				
			||||
    } | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  if (thneed != NULL && thneed->record & 4) { | 
				
			||||
    // extreme debug
 | 
				
			||||
    printf("%s -- %p\n", name, kernel); | 
				
			||||
    for (int i = 0; i < num_args; i++) { | 
				
			||||
      char arg_type[0x100]; | 
				
			||||
      char arg_name[0x100]; | 
				
			||||
      clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_TYPE_NAME, sizeof(arg_type), arg_type, NULL); | 
				
			||||
      clGetKernelArgInfo(kernel, i, CL_KERNEL_ARG_NAME, sizeof(arg_name), arg_name, NULL); | 
				
			||||
      std::string arg = g_args[std::make_pair(kernel, i)]; | 
				
			||||
      printf("  %s %s", arg_type, arg_name); | 
				
			||||
      void *arg_value = (void*)arg.data(); | 
				
			||||
      int arg_size = arg.size(); | 
				
			||||
      if (arg_size == 1) { | 
				
			||||
        printf(" = %d", *((char*)arg_value)); | 
				
			||||
      } else if (arg_size == 2) { | 
				
			||||
        printf(" = %d", *((short*)arg_value)); | 
				
			||||
      } else if (arg_size == 4) { | 
				
			||||
        if (strcmp(arg_type, "float") == 0) { | 
				
			||||
          printf(" = %f", *((float*)arg_value)); | 
				
			||||
        } else { | 
				
			||||
          printf(" = %d", *((int*)arg_value)); | 
				
			||||
        } | 
				
			||||
      } else if (arg_size == 8) { | 
				
			||||
        cl_mem val = (cl_mem)(*((uintptr_t*)arg_value)); | 
				
			||||
        printf(" = %p", val); | 
				
			||||
      } | 
				
			||||
      printf("\n"); | 
				
			||||
    } | 
				
			||||
  } | 
				
			||||
 | 
				
			||||
  cl_int ret = my_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, | 
				
			||||
    global_work_offset, global_work_size, local_work_size, | 
				
			||||
    num_events_in_wait_list, event_wait_list, event); | 
				
			||||
  return ret; | 
				
			||||
} | 
				
			||||
 | 
				
			||||
void *dlsym(void *handle, const char *symbol) { | 
				
			||||
  void *(*my_dlsym)(void *handle, const char *symbol) = (void *(*)(void *handle, const char *symbol))((uintptr_t)dlopen-0x2d4); | 
				
			||||
  if (memcmp("REAL_", symbol, 5) == 0) { | 
				
			||||
    return my_dlsym(handle, symbol+5); | 
				
			||||
  } else if (strcmp("clEnqueueNDRangeKernel", symbol) == 0) { | 
				
			||||
    return (void*)clEnqueueNDRangeKernel; | 
				
			||||
  } else if (strcmp("clSetKernelArg", symbol) == 0) { | 
				
			||||
    return (void*)clSetKernelArg; | 
				
			||||
  } else { | 
				
			||||
    return my_dlsym(handle, symbol); | 
				
			||||
  } | 
				
			||||
} | 
				
			||||
 | 
				
			||||
@ -0,0 +1,50 @@ | 
				
			||||
#pragma once | 
				
			||||
 | 
				
			||||
#include <stdint.h> | 
				
			||||
#include "include/msm_kgsl.h" | 
				
			||||
#include <vector> | 
				
			||||
#include <CL/cl.h> | 
				
			||||
 | 
				
			||||
class Thneed; | 
				
			||||
 | 
				
			||||
class GPUMalloc { | 
				
			||||
  public: | 
				
			||||
    GPUMalloc(int size, int fd); | 
				
			||||
    void *alloc(int size); | 
				
			||||
  private: | 
				
			||||
    uint64_t base; | 
				
			||||
    int remaining; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
class CachedCommand { | 
				
			||||
  public: | 
				
			||||
    CachedCommand(Thneed *lthneed, struct kgsl_gpu_command *cmd); | 
				
			||||
    void exec(bool wait); | 
				
			||||
  private: | 
				
			||||
    struct kgsl_gpu_command cache; | 
				
			||||
    struct kgsl_command_object cmds[2]; | 
				
			||||
    struct kgsl_command_object objs[1]; | 
				
			||||
    Thneed *thneed; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
class Thneed { | 
				
			||||
  public: | 
				
			||||
    Thneed(); | 
				
			||||
    void stop(); | 
				
			||||
    void execute(float **finputs, float *foutput); | 
				
			||||
 | 
				
			||||
    std::vector<cl_mem> inputs; | 
				
			||||
    cl_mem output; | 
				
			||||
 | 
				
			||||
    cl_command_queue command_queue; | 
				
			||||
    int context_id; | 
				
			||||
 | 
				
			||||
    // protected?
 | 
				
			||||
    int record; | 
				
			||||
    int timestamp; | 
				
			||||
    GPUMalloc *ram; | 
				
			||||
    std::vector<CachedCommand *> cmds; | 
				
			||||
    std::vector<std::pair<int, struct kgsl_gpuobj_sync_obj *> > syncobjs; | 
				
			||||
    int fd; | 
				
			||||
}; | 
				
			||||
 | 
				
			||||
					Loading…
					
					
				
		Reference in new issue