openpilot is an open source driver assistance system. openpilot performs the functions of Automated Lane Centering and Adaptive Cruise Control for over 200 supported car makes and models.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

157 lines
6.1 KiB

import time
from hexdump import hexdump
from tinygrad import Tensor, Device
import tinygrad.runtime.autogen.amd_gpu as amd_gpu
import tinygrad.runtime.autogen.kfd as kfd
import tinygrad.runtime.autogen.hsa as hsa
from tinygrad.engine.schedule import create_schedule
from tinygrad.runtime.ops_amd import kio, AMDProgram
from tinygrad.helpers import to_mv
# NOTE(review): unused in this script; equals CS_W32_EN | USE_THREAD_DIMENSIONS |
# COMPUTE_SHADER_EN (0x8021) — presumably a reference dispatch-initiator value.
DISPATCH_INIT_VALUE = 0x21 | 0x8000
#mmCOMPUTE_START_X = 0x2e04
#mmCOMPUTE_PGM_LO = 0x2e0c
# Compute register offsets. PACKET3_SET_SH_REG payloads carry offsets relative to
# PACKET3_SET_SH_REG_START; the 0x1bXX values below are presumably BASE_ADDR-relative,
# so subtracting SUB rebases them into SET_SH_REG space — TODO confirm against the
# GFX11 register headers.
BASE_ADDR = 0x00001260
PACKET3_SET_SH_REG_START = 0x2c00
SUB = PACKET3_SET_SH_REG_START - BASE_ADDR
regCOMPUTE_PGM_LO = 0x1bac - SUB
regCOMPUTE_START_X = 0x1ba4 - SUB
regCOMPUTE_NUM_THREAD_X = 0x1ba7 - SUB
regCOMPUTE_USER_DATA_0 = 0x1be0 - SUB
regCOMPUTE_USER_DATA_8 = 0x1be8 - SUB
regCOMPUTE_PGM_RSRC1 = 0x1bb2 - SUB
regCOMPUTE_PGM_RSRC2 = 0x1bb3 - SUB
# How to run / inspect registers on hardware:
# DEBUG=6 python3 extra/hip_gpu_driver/test_pm4.py
# sudo umr -i 1 -s amd744c.gfx1100 --sbank 1 1 2 | grep regCOMPUTE
# 0x00009025
# Bits for the DISPATCH_DIRECT initiator word (used in the dispatch packet in __main__).
COMPUTE_SHADER_EN = 1
USE_THREAD_DIMENSIONS = 1 << 5
CS_W32_EN = 1 << 15
def format_struct(s):
  """Return a list of "name:value" strings, one per field of ctypes-style struct *s*.

  Integer field values are rendered in uppercase hex ("name:0xFF"); all other
  values fall back to their str() form ("name:value").
  """
  def _render(name, value):
    if isinstance(value, int):
      return f"{name}:0x{value:X}"
    return f"{name}:{value}"
  return [_render(name, getattr(s, name)) for name, _ in s._fields_]
if __name__ == "__main__":
  # Hand-rolled PM4 dispatch experiment: compile a tiny tinygrad kernel (b = a + 7),
  # then submit it through a raw KFD compute queue — building the PM4 packets,
  # writing the ring buffer, and ringing the doorbell by hand instead of using
  # the normal runtime submission path.
  dev = Device["KFD"]
  a = Tensor([0.,1.,2.], device="KFD").realize()
  b = a + 7
  b.lazydata.buffer.allocate()  # allocate the output buffer without running the kernel
  si = create_schedule([b.lazydata])[-1]  # last schedule item is the add kernel
  runner = dev.get_runner(*si.ast)
  prg: AMDProgram = runner.clprg  # compiled program; prg.handle points at the code object
  print("device initted")
  # Compute Queue
  # GTT (host-visible, uncached) memory for the read/write pointers and the ring
  # itself; VRAM for the EOP and ctx save/restore buffers (currently not passed
  # to create_queue — see the commented-out kwargs below).
  gart_compute = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  eop_buffer = dev._gpu_alloc(0x1000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  compute_ring = dev._gpu_alloc(0x800000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_GTT, uncached=True)
  ctx_save_restore_address = dev._gpu_alloc(0x2C02000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  compute_queue = kio.create_queue(dev.kfd, ring_base_address=compute_ring.va_addr, ring_size=compute_ring.size, gpu_id=dev.gpu_id,
    queue_type=kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, queue_percentage=kfd.KFD_MAX_QUEUE_PERCENTAGE, queue_priority=kfd.KFD_MAX_QUEUE_PRIORITY,
    #eop_buffer_address=eop_buffer.va_addr, eop_buffer_size=eop_buffer.size,
    #ctx_save_restore_address=ctx_save_restore_address.va_addr, ctx_save_restore_size=ctx_save_restore_address.size,
    #ctl_stack_size = 0xa000,
    write_pointer_address=gart_compute.va_addr, read_pointer_address=gart_compute.va_addr+8)
  # View of this queue's 32-bit doorbell slot within the mapped doorbell page.
  compute_doorbell = to_mv(dev.doorbells + compute_queue.doorbell_offset - dev.doorbells_base, 4).cast("I")
  #scratch = dev._gpu_alloc(0x10000, kfd.KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
  # Kernel arguments: two 64-bit pointers at kernargs_ptr — output buffer first,
  # then input buffer.
  ka = to_mv(dev.kernargs_ptr, 0x10).cast("Q")
  ka[0] = b.lazydata.buffer._buf.va_addr
  ka[1] = a.lazydata.buffer._buf.va_addr
  compute_read_pointer = to_mv(compute_queue.read_pointer_address, 8).cast("Q")
  compute_write_pointer = to_mv(compute_queue.write_pointer_address, 8).cast("Q")
  hexdump(to_mv(prg.handle, 0x40))
  # prg.handle points at an amd_kernel_code_t header; the actual ISA entry point
  # lives kernel_code_entry_byte_offset bytes past it.
  code = hsa.amd_kernel_code_t.from_address(prg.handle)
  #print(format_struct(code))
  #print("code")
  #hexdump(to_mv(code_ptr, 0x100))
  #runner.local_size = [2,1,1]
  print(runner.local_size, runner.global_size)
  #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), mmCOMPUTE_PGM_LO,
  # prg.handle&0xFFFFFFFF, prg.handle>>32, 0, 0, (scratch.va_addr>>8)&0xFFFFFFFF, scratch.va_addr>>40]
  # Shader entry address is programmed shifted right by 8 (256-byte granularity).
  code_ptr = (prg.handle + code.kernel_code_entry_byte_offset) >> 8
  # Build the PM4 stream: program address, resource words, kernarg pointer
  # (USER_DATA_0/1), workgroup size, then the direct dispatch.
  pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 6), regCOMPUTE_PGM_LO, code_ptr&0xFFFFFFFF, code_ptr>>32, 0, 0, 0, 0]
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_PGM_RSRC1, code.compute_pgm_rsrc1, code.compute_pgm_rsrc2]
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, dev.kernargs_ptr&0xFFFFFFFF, dev.kernargs_ptr>>32]
  #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 2), regCOMPUTE_USER_DATA_0, 0, 0]
  # START_X/Y/Z = 0, NUM_THREAD_X/Y/Z = local size, plus two trailing zero words.
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), regCOMPUTE_START_X, 0,0,0,
    runner.local_size[0],runner.local_size[1],runner.local_size[2],0,0]
  # disabled USE_THREAD_DIMENSIONS
  pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, 3),
    runner.global_size[0],runner.global_size[1],runner.global_size[2], CS_W32_EN | COMPUTE_SHADER_EN]
  #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_NOP, 0x3fff)]*0x200
  # Alternative experiment kept for reference: an ACQUIRE_MEM cache-invalidate packet.
  """
addr=0x0
sz=(1 << 64)-1
gli=0
glv=0
glk=0
gl1=0
gl2=0
pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]
print(pm4_cmd)
"""
  wptr = 0
  pm4_buffer_view = to_mv(compute_ring.va_addr, compute_ring.size).cast("I")
  # Submission: copy the packet into the ring, advance the write pointer, then
  # ring the doorbell. The outer loop could resubmit up to 0x80000 times but
  # unconditionally breaks after the first iteration.
  for j in range(0x80000):
    for i, value in enumerate(pm4_cmd): pm4_buffer_view[wptr+i] = value
    wptr += len(pm4_cmd)
    compute_write_pointer[0] = wptr
    compute_doorbell[0] = wptr
    # Poll up to 10x10ms for the GPU to consume the packet (rptr catches wptr).
    for k in range(10):
      done = compute_read_pointer[0] == compute_write_pointer[0]
      print(compute_read_pointer[0], compute_write_pointer[0], done)
      if done: break
      time.sleep(0.01)
    break
    #break
    #print(compute_read_pointer[0])
    #time.sleep(0.05)
    #print(compute_read_pointer[0])
    #time.sleep(100)
  # If the dispatch executed, b prints a+7, i.e. [7. 8. 9.].
  print(a.numpy())
  print(b.numpy())
  exit(0)
  #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_SET_SH_REG, 8), mmCOMPUTE_PGM_LO, 0,0,0,1,1,1,0,0]
  #pm4_cmd += [amd_gpu.PACKET3(amd_gpu.PACKET3_DISPATCH_DIRECT, )]
  #pm4_cmd = [amd_gpu.PACKET3(amd_gpu.PACKET3_ACQUIRE_MEM, 6), 0,
  # sz & 0xffffffff, (sz >> 32) & 0xff, addr & 0xffffffff, (addr >> 32) & 0xffffff, 0,
  # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLI_INV(gli) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLK_INV(glk) | \
  # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GLV_INV(glv) | amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL1_INV(gl1) | \
  # amd_gpu.PACKET3_ACQUIRE_MEM_GCR_CNTL_GL2_INV(gl2)]