You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
2435 lines
96 KiB
2435 lines
96 KiB
.text
|
|
.amdgcn_target "amdgcn-amd-amdhsa--gfx1100"
|
|
;.amdhsa_code_object_version 5
|
|
.protected kernel ; -- Begin function kernel
|
|
.globl kernel
|
|
.p2align 8
|
|
.type kernel,@function
|
|
kernel: ; @kernel
|
|
; %bb.0: ; %.preheader193
|
|
|
|
;; Init code for matrix A and B buffer Loads - START
|
|
s_load_b128 s[20:23], s[0:1], 0x8 ; Matrix A and B
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
; Matrix B offsets:
|
|
; input is s[22:23]
|
|
; offset base addresses s[24:39]
|
|
|
|
s_add_u32 s24, s22, 0x0000
|
|
s_addc_u32 s25, s23, 0
|
|
s_add_u32 s26, s22, 0x4000
|
|
s_addc_u32 s27, s23, 0
|
|
s_add_u32 s28, s22, 0x8000
|
|
s_addc_u32 s29, s23, 0
|
|
s_add_u32 s30, s22, 0xc000
|
|
s_addc_u32 s31, s23, 0
|
|
s_add_u32 s32, s22, 0x10000
|
|
s_addc_u32 s33, s23, 0
|
|
s_add_u32 s34, s22, 0x14000
|
|
s_addc_u32 s35, s23, 0
|
|
s_add_u32 s36, s22, 0x18000
|
|
s_addc_u32 s37, s23, 0
|
|
s_add_u32 s38, s22, 0x1c000
|
|
s_addc_u32 s39, s23, 0
|
|
|
|
; compute Matrix B offset
|
|
s_lshl_b32 s19, s14, 7 ; BN * blockIdx.x
|
|
|
|
|
|
v_add_nc_u32_e32 v203, s19, v0 ; index = BN * blockIdx.x + threadIdx.x
|
|
v_lshlrev_b32_e32 v203,2, v203 ; offset = 4*index (the VGPR offset in global_load is in bytes when using SGPR base addressing)
|
|
|
|
; Matrix A offsets:
|
|
; input is s[20:21]
|
|
; offset base addresses s[40:55]
|
|
s_add_u32 s40, s20, 0x0000
|
|
s_addc_u32 s41, s21, 0
|
|
s_add_u32 s42, s20, 0x40000
|
|
s_addc_u32 s43, s21, 0
|
|
s_add_u32 s44, s20, 0x80000
|
|
s_addc_u32 s45, s21, 0
|
|
s_add_u32 s46, s20, 0xc0000
|
|
s_addc_u32 s47, s21, 0
|
|
s_add_u32 s48, s20, 0x100000
|
|
s_addc_u32 s49, s21, 0
|
|
s_add_u32 s50, s20, 0x140000
|
|
s_addc_u32 s51, s21, 0
|
|
s_add_u32 s52, s20, 0x180000
|
|
s_addc_u32 s53, s21, 0
|
|
s_add_u32 s54, s20, 0x1c0000
|
|
s_addc_u32 s55, s21, 0
|
|
|
|
; compute Matrix A offset
|
|
s_lshl_b32 s19, s15, 19 ; 4096 * 128 * blockIdx.y
|
|
v_lshrrev_b32_e32 v1, 3, v0 ; threadIdx.x / 8
|
|
v_lshlrev_b32_e32 v1, 12, v1 ; 4096 * (threadIdx.x/8)
|
|
v_and_b32_e32 v215, 7, v0 ; threadIdx.x % 8
|
|
v_add_u32_e32 v215, v1, v215 ; index = 4096*(threadIdx.x/8) + threadIdx.x % 8
|
|
v_add_nc_u32_e32 v215, s19, v215 ; index += 4096*128*blockIdx.y
|
|
v_lshlrev_b32_e32 v215,2, v215 ; offset = 4*index
|
|
|
|
|
|
|
|
|
|
;; Init code for matrix A and B buffer Loads - END
|
|
|
|
|
|
s_clause 0x1
|
|
;s_load_b128 s[4:7], s[0:1], 0x18 ; N, alpha, beta, ???
|
|
s_load_b128 s[8:11], s[0:1], 0x8 ; Matrix A and B
|
|
|
|
s_mov_b32 s4, 4096 ; hardcode 4096
|
|
s_mov_b32 s5, 0x3f800000 ; alpha
|
|
s_mov_b32 s6, 0 ; beta
|
|
s_mov_b32 s7, 0
|
|
|
|
s_lshl_b32 s2, s14, 7
|
|
v_lshrrev_b32_e32 v4, 3, v0
|
|
v_or_b32_e32 v1, s2, v0
|
|
s_lshl_b32 s3, s15, 7
|
|
v_and_b32_e32 v118, 7, v0
|
|
s_bfe_i32 s12, s15, 0x10018
|
|
v_or_b32_e32 v22, s3, v4
|
|
v_ashrrev_i32_e32 v2, 31, v1
|
|
s_lshr_b32 s12, s12, 25
|
|
s_load_b64 s[0:1], s[0:1], 0 ; Matrix C
|
|
v_lshlrev_b32_e32 v135, 2, v118
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_3)
|
|
v_lshlrev_b64 v[5:6], 2, v[1:2]
|
|
s_waitcnt lgkmcnt(0)
|
|
v_add_nc_u32_e32 v3, s4, v1
|
|
v_mul_lo_u32 v119, v22, s4
|
|
v_add_co_u32 v5, vcc_lo, s10, v5
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
v_add_co_ci_u32_e32 v6, vcc_lo, s11, v6, vcc_lo
|
|
v_add_nc_u32_e32 v7, s4, v3
|
|
v_ashrrev_i32_e32 v4, 31, v3
|
|
s_lshl_b32 s7, s4, 4
|
|
s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
v_add_nc_u32_e32 v125, s7, v119
|
|
v_add_nc_u32_e32 v2, s4, v7
|
|
v_ashrrev_i32_e32 v8, 31, v7
|
|
v_lshlrev_b64 v[9:10], 2, v[3:4]
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
v_add_nc_u32_e32 v130, s7, v125
|
|
v_add_nc_u32_e32 v11, s4, v2
|
|
v_ashrrev_i32_e32 v3, 31, v2
|
|
v_lshlrev_b64 v[7:8], 2, v[7:8]
|
|
v_add_co_u32 v9, vcc_lo, s10, v9
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
v_add_nc_u32_e32 v13, s4, v11
|
|
v_ashrrev_i32_e32 v12, 31, v11
|
|
v_lshlrev_b64 v[2:3], 2, v[2:3]
|
|
v_add_co_ci_u32_e32 v10, vcc_lo, s11, v10, vcc_lo
|
|
v_ashrrev_i32_e32 v14, 31, v13
|
|
v_add_co_u32 v7, vcc_lo, s10, v7
|
|
v_lshlrev_b64 v[11:12], 2, v[11:12]
|
|
v_add_co_ci_u32_e32 v8, vcc_lo, s11, v8, vcc_lo
|
|
v_add_nc_u32_e32 v15, s4, v13
|
|
v_add_co_u32 v2, vcc_lo, s10, v2
|
|
v_lshlrev_b64 v[13:14], 2, v[13:14]
|
|
v_add_co_ci_u32_e32 v3, vcc_lo, s11, v3, vcc_lo
|
|
v_add_co_u32 v11, vcc_lo, s10, v11
|
|
v_add_nc_u32_e32 v4, s4, v15
|
|
v_add_co_ci_u32_e32 v12, vcc_lo, s11, v12, vcc_lo
|
|
v_add_co_u32 v13, vcc_lo, s10, v13
|
|
v_ashrrev_i32_e32 v16, 31, v15
|
|
v_add_nc_u32_e32 v134, s7, v130
|
|
v_add_co_ci_u32_e32 v14, vcc_lo, s11, v14, vcc_lo
|
|
s_clause 0x5
|
|
global_load_b32 v23, v[5:6], off
|
|
global_load_b32 v24, v[9:10], off
|
|
global_load_b32 v25, v[7:8], off
|
|
global_load_b32 v26, v[2:3], off
|
|
global_load_b32 v27, v[11:12], off
|
|
global_load_b32 v28, v[13:14], off
|
|
v_add_nc_u32_e32 v6, v119, v118
|
|
v_ashrrev_i32_e32 v5, 31, v4
|
|
v_add_nc_u32_e32 v8, v125, v118
|
|
v_lshlrev_b64 v[2:3], 2, v[15:16]
|
|
v_add_nc_u32_e32 v137, s7, v134
|
|
v_ashrrev_i32_e32 v7, 31, v6
|
|
v_add_nc_u32_e32 v10, v130, v118
|
|
v_lshlrev_b64 v[4:5], 2, v[4:5]
|
|
v_ashrrev_i32_e32 v9, 31, v8
|
|
v_add_nc_u32_e32 v12, v134, v118
|
|
v_add_nc_u32_e32 v138, s7, v137
|
|
v_add_co_u32 v2, vcc_lo, s10, v2
|
|
v_lshlrev_b64 v[6:7], 2, v[6:7]
|
|
v_ashrrev_i32_e32 v11, 31, v10
|
|
v_add_co_ci_u32_e32 v3, vcc_lo, s11, v3, vcc_lo
|
|
v_add_nc_u32_e32 v14, v137, v118
|
|
v_add_co_u32 v4, vcc_lo, s10, v4
|
|
v_lshlrev_b64 v[8:9], 2, v[8:9]
|
|
v_ashrrev_i32_e32 v13, 31, v12
|
|
v_add_nc_u32_e32 v139, s7, v138
|
|
v_add_co_ci_u32_e32 v5, vcc_lo, s11, v5, vcc_lo
|
|
v_add_co_u32 v6, vcc_lo, s8, v6
|
|
v_lshlrev_b64 v[10:11], 2, v[10:11]
|
|
v_ashrrev_i32_e32 v15, 31, v14
|
|
v_add_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo
|
|
v_add_nc_u32_e32 v16, v138, v118
|
|
v_add_co_u32 v8, vcc_lo, s8, v8
|
|
v_lshlrev_b64 v[12:13], 2, v[12:13]
|
|
v_add_nc_u32_e32 v140, s7, v139
|
|
v_add_co_ci_u32_e32 v9, vcc_lo, s9, v9, vcc_lo
|
|
v_add_nc_u32_e32 v18, v139, v118
|
|
v_add_co_u32 v10, vcc_lo, s8, v10
|
|
v_lshlrev_b64 v[14:15], 2, v[14:15]
|
|
v_ashrrev_i32_e32 v17, 31, v16
|
|
v_add_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
|
|
v_add_nc_u32_e32 v20, v140, v118
|
|
v_add_co_u32 v12, vcc_lo, s8, v12
|
|
v_ashrrev_i32_e32 v19, 31, v18
|
|
v_add_co_ci_u32_e32 v13, vcc_lo, s9, v13, vcc_lo
|
|
v_add_co_u32 v14, vcc_lo, s8, v14
|
|
v_lshlrev_b64 v[16:17], 2, v[16:17]
|
|
v_ashrrev_i32_e32 v21, 31, v20
|
|
v_add_co_ci_u32_e32 v15, vcc_lo, s9, v15, vcc_lo
|
|
s_clause 0x4
|
|
global_load_b32 v29, v[6:7], off
|
|
global_load_b32 v30, v[8:9], off
|
|
global_load_b32 v31, v[10:11], off
|
|
global_load_b32 v12, v[12:13], off
|
|
global_load_b32 v13, v[14:15], off
|
|
v_lshlrev_b64 v[6:7], 2, v[18:19]
|
|
v_add_co_u32 v8, vcc_lo, s8, v16
|
|
v_lshlrev_b64 v[10:11], 2, v[20:21]
|
|
v_add_co_ci_u32_e32 v9, vcc_lo, s9, v17, vcc_lo
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
v_add_co_u32 v6, vcc_lo, s8, v6
|
|
v_add_co_ci_u32_e32 v7, vcc_lo, s9, v7, vcc_lo
|
|
v_add_co_u32 v10, vcc_lo, s8, v10
|
|
v_add_co_ci_u32_e32 v11, vcc_lo, s9, v11, vcc_lo
|
|
s_clause 0x1
|
|
global_load_b32 v3, v[2:3], off
|
|
global_load_b32 v4, v[4:5], off
|
|
s_clause 0x2
|
|
global_load_b32 v5, v[8:9], off
|
|
global_load_b32 v6, v[6:7], off
|
|
global_load_b32 v7, v[10:11], off
|
|
v_add_nc_u32_e32 v9, s12, v22
|
|
v_or_b32_e32 v10, 16, v22
|
|
v_or_b32_e32 v11, 32, v22
|
|
v_or_b32_e32 v14, 48, v22
|
|
v_or_b32_e32 v15, 64, v22
|
|
v_or_b32_e32 v16, 0x50, v22
|
|
v_or_b32_e32 v17, 0x60, v22
|
|
v_or_b32_e32 v18, 0x70, v22
|
|
s_bfe_i32 s7, s14, 0x10018
|
|
v_and_b32_e32 v9, 0x3fffff80, v9
|
|
s_lshr_b32 s7, s7, 25
|
|
v_add_nc_u32_e32 v19, s12, v10
|
|
v_add_nc_u32_e32 v8, s7, v1
|
|
v_add_nc_u32_e32 v20, s12, v11
|
|
v_add_nc_u32_e32 v21, s12, v14
|
|
v_add_nc_u32_e32 v32, s12, v15
|
|
v_add_nc_u32_e32 v33, s12, v16
|
|
v_add_nc_u32_e32 v34, s12, v17
|
|
v_add_nc_u32_e32 v35, s12, v18
|
|
v_and_b32_e32 v8, 0x3fffff80, v8
|
|
v_sub_nc_u32_e32 v9, v22, v9
|
|
v_and_b32_e32 v19, 0x3fffff80, v19
|
|
v_and_b32_e32 v20, 0x3fffff80, v20
|
|
v_and_b32_e32 v21, 0x3fffff80, v21
|
|
v_and_b32_e32 v22, 0x3fffff80, v32
|
|
v_and_b32_e32 v32, 0x3fffff80, v33
|
|
v_and_b32_e32 v33, 0x3fffff80, v34
|
|
v_and_b32_e32 v34, 0x3fffff80, v35
|
|
v_sub_nc_u32_e32 v8, v1, v8
|
|
v_lshlrev_b32_e32 v9, 2, v9
|
|
v_sub_nc_u32_e32 v10, v10, v19
|
|
v_sub_nc_u32_e32 v11, v11, v20
|
|
v_sub_nc_u32_e32 v14, v14, v21
|
|
v_sub_nc_u32_e32 v15, v15, v22
|
|
v_sub_nc_u32_e32 v16, v16, v32
|
|
v_sub_nc_u32_e32 v17, v17, v33
|
|
v_sub_nc_u32_e32 v18, v18, v34
|
|
v_bfe_u32 v2, v0, 3, 2
|
|
v_lshlrev_b32_e32 v8, 2, v8
|
|
v_mad_u32_u24 v141, 0x210, v118, v9
|
|
v_lshlrev_b32_e32 v9, 2, v10
|
|
v_lshlrev_b32_e32 v10, 2, v11
|
|
v_lshlrev_b32_e32 v11, 2, v14
|
|
v_lshlrev_b32_e32 v14, 2, v15
|
|
v_lshlrev_b32_e32 v15, 2, v16
|
|
v_lshlrev_b32_e32 v16, 2, v17
|
|
v_lshlrev_b32_e32 v17, 2, v18
|
|
v_lshlrev_b32_e32 v136, 2, v2
|
|
v_add_nc_u32_e32 v8, 0x80, v8
|
|
v_mad_u32_u24 v142, 0x210, v118, v9
|
|
v_mad_u32_u24 v143, 0x210, v118, v10
|
|
v_mad_u32_u24 v144, 0x210, v118, v11
|
|
v_mad_u32_u24 v145, 0x210, v118, v14
|
|
v_mad_u32_u24 v146, 0x210, v118, v15
|
|
v_mad_u32_u24 v147, 0x210, v118, v16
|
|
v_mad_u32_u24 v148, 0x210, v118, v17
|
|
s_mov_b32 s7, 0
|
|
s_cmp_gt_i32 s4, 0
|
|
s_waitcnt vmcnt(14)
|
|
ds_store_2addr_stride64_b32 v8, v23, v24 offset0:16 offset1:18
|
|
s_waitcnt vmcnt(9)
|
|
ds_store_b32 v141, v29
|
|
ds_store_2addr_stride64_b32 v8, v25, v26 offset0:20 offset1:22
|
|
s_waitcnt vmcnt(8)
|
|
ds_store_b32 v142, v30
|
|
s_waitcnt vmcnt(7)
|
|
ds_store_b32 v143, v31
|
|
ds_store_2addr_stride64_b32 v8, v27, v28 offset0:24 offset1:26
|
|
s_waitcnt vmcnt(6)
|
|
ds_store_b32 v144, v12
|
|
s_waitcnt vmcnt(5)
|
|
ds_store_b32 v145, v13
|
|
s_waitcnt vmcnt(3)
|
|
ds_store_2addr_stride64_b32 v8, v3, v4 offset0:28 offset1:30
|
|
s_waitcnt vmcnt(2)
|
|
ds_store_b32 v146, v5
|
|
s_waitcnt vmcnt(1)
|
|
ds_store_b32 v147, v6
|
|
s_waitcnt vmcnt(0)
|
|
ds_store_b32 v148, v7
|
|
s_waitcnt lgkmcnt(0)
|
|
s_barrier
|
|
s_cbranch_scc1 .LBB0_3
|
|
; %bb.1: ; %.preheader193..preheader180_crit_edge
|
|
v_lshlrev_b32_e32 v149, 2, v118
|
|
v_lshlrev_b32_e32 v150, 2, v2
|
|
s_mov_b32 s12, 0
|
|
s_and_not1_b32 vcc_lo, exec_lo, s7
|
|
s_cbranch_vccz .LBB0_4
|
|
; %bb.2:
|
|
v_dual_mov_b32 v2, s12 :: v_dual_mov_b32 v3, s12
|
|
v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s12
|
|
v_dual_mov_b32 v18, s12 :: v_dual_mov_b32 v19, s12
|
|
v_dual_mov_b32 v20, s12 :: v_dual_mov_b32 v21, s12
|
|
v_dual_mov_b32 v34, s12 :: v_dual_mov_b32 v35, s12
|
|
v_dual_mov_b32 v36, s12 :: v_dual_mov_b32 v37, s12
|
|
v_dual_mov_b32 v50, s12 :: v_dual_mov_b32 v51, s12
|
|
v_dual_mov_b32 v52, s12 :: v_dual_mov_b32 v53, s12
|
|
v_dual_mov_b32 v6, s12 :: v_dual_mov_b32 v7, s12
|
|
v_dual_mov_b32 v8, s12 :: v_dual_mov_b32 v9, s12
|
|
v_dual_mov_b32 v22, s12 :: v_dual_mov_b32 v23, s12
|
|
v_dual_mov_b32 v24, s12 :: v_dual_mov_b32 v25, s12
|
|
v_dual_mov_b32 v38, s12 :: v_dual_mov_b32 v39, s12
|
|
v_dual_mov_b32 v40, s12 :: v_dual_mov_b32 v41, s12
|
|
v_dual_mov_b32 v54, s12 :: v_dual_mov_b32 v55, s12
|
|
v_dual_mov_b32 v56, s12 :: v_dual_mov_b32 v57, s12
|
|
v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s12
|
|
v_dual_mov_b32 v12, s12 :: v_dual_mov_b32 v13, s12
|
|
v_dual_mov_b32 v26, s12 :: v_dual_mov_b32 v27, s12
|
|
v_dual_mov_b32 v28, s12 :: v_dual_mov_b32 v29, s12
|
|
v_dual_mov_b32 v42, s12 :: v_dual_mov_b32 v43, s12
|
|
v_dual_mov_b32 v44, s12 :: v_dual_mov_b32 v45, s12
|
|
v_dual_mov_b32 v58, s12 :: v_dual_mov_b32 v59, s12
|
|
v_dual_mov_b32 v60, s12 :: v_dual_mov_b32 v61, s12
|
|
v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s12
|
|
v_dual_mov_b32 v16, s12 :: v_dual_mov_b32 v17, s12
|
|
v_dual_mov_b32 v30, s12 :: v_dual_mov_b32 v31, s12
|
|
v_dual_mov_b32 v32, s12 :: v_dual_mov_b32 v33, s12
|
|
v_dual_mov_b32 v46, s12 :: v_dual_mov_b32 v47, s12
|
|
v_dual_mov_b32 v48, s12 :: v_dual_mov_b32 v49, s12
|
|
v_dual_mov_b32 v62, s12 :: v_dual_mov_b32 v63, s12
|
|
v_dual_mov_b32 v64, s12 :: v_dual_mov_b32 v65, s12
|
|
v_dual_mov_b32 v66, s12 :: v_dual_mov_b32 v67, s12
|
|
v_dual_mov_b32 v68, s12 :: v_dual_mov_b32 v69, s12
|
|
v_dual_mov_b32 v82, s12 :: v_dual_mov_b32 v83, s12
|
|
v_dual_mov_b32 v84, s12 :: v_dual_mov_b32 v85, s12
|
|
v_dual_mov_b32 v98, s12 :: v_dual_mov_b32 v99, s12
|
|
v_dual_mov_b32 v100, s12 :: v_dual_mov_b32 v101, s12
|
|
v_dual_mov_b32 v114, s12 :: v_dual_mov_b32 v115, s12
|
|
v_dual_mov_b32 v116, s12 :: v_dual_mov_b32 v117, s12
|
|
v_dual_mov_b32 v70, s12 :: v_dual_mov_b32 v71, s12
|
|
v_dual_mov_b32 v72, s12 :: v_dual_mov_b32 v73, s12
|
|
v_dual_mov_b32 v86, s12 :: v_dual_mov_b32 v87, s12
|
|
v_dual_mov_b32 v88, s12 :: v_dual_mov_b32 v89, s12
|
|
v_dual_mov_b32 v102, s12 :: v_dual_mov_b32 v103, s12
|
|
v_dual_mov_b32 v104, s12 :: v_dual_mov_b32 v105, s12
|
|
v_dual_mov_b32 v120, s12 :: v_dual_mov_b32 v121, s12
|
|
v_dual_mov_b32 v122, s12 :: v_dual_mov_b32 v123, s12
|
|
v_dual_mov_b32 v74, s12 :: v_dual_mov_b32 v75, s12
|
|
v_dual_mov_b32 v76, s12 :: v_dual_mov_b32 v77, s12
|
|
v_dual_mov_b32 v90, s12 :: v_dual_mov_b32 v91, s12
|
|
v_dual_mov_b32 v92, s12 :: v_dual_mov_b32 v93, s12
|
|
v_dual_mov_b32 v106, s12 :: v_dual_mov_b32 v107, s12
|
|
v_dual_mov_b32 v108, s12 :: v_dual_mov_b32 v109, s12
|
|
v_dual_mov_b32 v126, s12 :: v_dual_mov_b32 v127, s12
|
|
v_dual_mov_b32 v128, s12 :: v_dual_mov_b32 v129, s12
|
|
v_dual_mov_b32 v78, s12 :: v_dual_mov_b32 v79, s12
|
|
v_dual_mov_b32 v80, s12 :: v_dual_mov_b32 v81, s12
|
|
v_dual_mov_b32 v94, s12 :: v_dual_mov_b32 v95, s12
|
|
v_dual_mov_b32 v96, s12 :: v_dual_mov_b32 v97, s12
|
|
v_dual_mov_b32 v110, s12 :: v_dual_mov_b32 v111, s12
|
|
v_dual_mov_b32 v112, s12 :: v_dual_mov_b32 v113, s12
|
|
v_dual_mov_b32 v131, s12 :: v_dual_mov_b32 v132, s12
|
|
v_dual_mov_b32 v133, s12 :: v_dual_mov_b32 v124, s12
|
|
s_branch .LBB0_13
|
|
.LBB0_3:
|
|
s_mov_b32 s7, -1
|
|
; implicit-def: $sgpr12
|
|
; implicit-def: $vgpr149
|
|
; implicit-def: $vgpr150
|
|
.LBB0_4: ; %.lr.ph
|
|
s_ashr_i32 s7, s2, 31
|
|
v_dual_mov_b32 v133, 0 :: v_dual_lshlrev_b32 v2, 4, v2
|
|
s_lshr_b32 s7, s7, 25
|
|
s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
|
|
v_dual_mov_b32 v124, 0 :: v_dual_add_nc_u32 v3, s7, v1
|
|
v_ashrrev_i32_e32 v149, 31, v119
|
|
v_ashrrev_i32_e32 v150, 31, v125
|
|
v_ashrrev_i32_e32 v151, 31, v130
|
|
v_dual_mov_b32 v132, 0 :: v_dual_and_b32 v3, 0x3fffff80, v3
|
|
v_ashrrev_i32_e32 v152, 31, v134
|
|
v_ashrrev_i32_e32 v153, 31, v137
|
|
v_ashrrev_i32_e32 v154, 31, v138
|
|
v_ashrrev_i32_e32 v156, 31, v139
|
|
v_sub_nc_u32_e32 v3, v1, v3
|
|
v_ashrrev_i32_e32 v157, 31, v140
|
|
v_lshl_or_b32 v166, v118, 4, 0x1080
|
|
v_dual_mov_b32 v131, 0 :: v_dual_mov_b32 v110, 0
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
v_lshl_add_u32 v155, v3, 2, 0x1080
|
|
v_dual_mov_b32 v112, 0 :: v_dual_lshlrev_b32 v3, 2, v0
|
|
v_dual_mov_b32 v113, 0 :: v_dual_mov_b32 v96, 0
|
|
v_lshl_add_u32 v158, 1, 9, v155
|
|
v_lshl_add_u32 v159, 2, 9, v155
|
|
v_lshl_add_u32 v160, 3, 9, v155
|
|
v_lshl_add_u32 v161, 4, 9, v155
|
|
v_lshl_add_u32 v162, 5, 9, v155
|
|
v_lshl_add_u32 v163, 6, 9, v155
|
|
v_lshl_add_u32 v164, 7, 9, v155
|
|
v_and_or_b32 v165, 0x180, v3, v2
|
|
v_dual_mov_b32 v111, 0 :: v_dual_mov_b32 v94, 0
|
|
v_dual_mov_b32 v97, 0 :: v_dual_mov_b32 v80, 0
|
|
v_dual_mov_b32 v95, 0 :: v_dual_mov_b32 v78, 0
|
|
v_dual_mov_b32 v81, 0 :: v_dual_mov_b32 v128, 0
|
|
v_dual_mov_b32 v79, 0 :: v_dual_mov_b32 v126, 0
|
|
v_dual_mov_b32 v129, 0 :: v_dual_mov_b32 v108, 0
|
|
v_dual_mov_b32 v127, 0 :: v_dual_mov_b32 v106, 0
|
|
v_dual_mov_b32 v109, 0 :: v_dual_mov_b32 v92, 0
|
|
v_dual_mov_b32 v107, 0 :: v_dual_mov_b32 v90, 0
|
|
v_dual_mov_b32 v93, 0 :: v_dual_mov_b32 v76, 0
|
|
v_dual_mov_b32 v91, 0 :: v_dual_mov_b32 v74, 0
|
|
v_dual_mov_b32 v77, 0 :: v_dual_mov_b32 v122, 0
|
|
v_dual_mov_b32 v75, 0 :: v_dual_mov_b32 v120, 0
|
|
v_dual_mov_b32 v123, 0 :: v_dual_mov_b32 v104, 0
|
|
v_dual_mov_b32 v121, 0 :: v_dual_mov_b32 v102, 0
|
|
v_dual_mov_b32 v105, 0 :: v_dual_mov_b32 v88, 0
|
|
v_dual_mov_b32 v103, 0 :: v_dual_mov_b32 v86, 0
|
|
v_dual_mov_b32 v89, 0 :: v_dual_mov_b32 v72, 0
|
|
v_dual_mov_b32 v87, 0 :: v_dual_mov_b32 v70, 0
|
|
v_dual_mov_b32 v73, 0 :: v_dual_mov_b32 v116, 0
|
|
v_dual_mov_b32 v71, 0 :: v_dual_mov_b32 v114, 0
|
|
v_dual_mov_b32 v117, 0 :: v_dual_mov_b32 v100, 0
|
|
v_dual_mov_b32 v115, 0 :: v_dual_mov_b32 v98, 0
|
|
v_dual_mov_b32 v101, 0 :: v_dual_mov_b32 v84, 0
|
|
v_dual_mov_b32 v99, 0 :: v_dual_mov_b32 v82, 0
|
|
v_dual_mov_b32 v85, 0 :: v_dual_mov_b32 v68, 0
|
|
v_dual_mov_b32 v83, 0 :: v_dual_mov_b32 v66, 0
|
|
v_dual_mov_b32 v69, 0 :: v_dual_mov_b32 v64, 0
|
|
v_dual_mov_b32 v67, 0 :: v_dual_mov_b32 v62, 0
|
|
v_dual_mov_b32 v65, 0 :: v_dual_mov_b32 v48, 0
|
|
v_dual_mov_b32 v63, 0 :: v_dual_mov_b32 v46, 0
|
|
v_dual_mov_b32 v49, 0 :: v_dual_mov_b32 v32, 0
|
|
v_dual_mov_b32 v47, 0 :: v_dual_mov_b32 v30, 0
|
|
v_dual_mov_b32 v33, 0 :: v_dual_mov_b32 v16, 0
|
|
v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v14, 0
|
|
v_dual_mov_b32 v17, 0 :: v_dual_mov_b32 v60, 0
|
|
v_dual_mov_b32 v15, 0 :: v_dual_mov_b32 v58, 0
|
|
v_dual_mov_b32 v61, 0 :: v_dual_mov_b32 v44, 0
|
|
v_dual_mov_b32 v59, 0 :: v_dual_mov_b32 v42, 0
|
|
v_dual_mov_b32 v45, 0 :: v_dual_mov_b32 v28, 0
|
|
v_dual_mov_b32 v43, 0 :: v_dual_mov_b32 v26, 0
|
|
v_dual_mov_b32 v29, 0 :: v_dual_mov_b32 v12, 0
|
|
v_dual_mov_b32 v27, 0 :: v_dual_mov_b32 v10, 0
|
|
v_dual_mov_b32 v13, 0 :: v_dual_mov_b32 v56, 0
|
|
v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v54, 0
|
|
v_dual_mov_b32 v57, 0 :: v_dual_mov_b32 v40, 0
|
|
v_dual_mov_b32 v55, 0 :: v_dual_mov_b32 v38, 0
|
|
v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v24, 0
|
|
v_dual_mov_b32 v39, 0 :: v_dual_mov_b32 v22, 0
|
|
v_dual_mov_b32 v25, 0 :: v_dual_mov_b32 v8, 0
|
|
v_dual_mov_b32 v23, 0 :: v_dual_mov_b32 v6, 0
|
|
v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v52, 0
|
|
v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v50, 0
|
|
v_dual_mov_b32 v53, 0 :: v_dual_mov_b32 v36, 0
|
|
v_dual_mov_b32 v51, 0 :: v_dual_mov_b32 v34, 0
|
|
v_dual_mov_b32 v37, 0 :: v_dual_mov_b32 v20, 0
|
|
v_dual_mov_b32 v35, 0 :: v_dual_mov_b32 v18, 0
|
|
v_dual_mov_b32 v21, 0 :: v_dual_mov_b32 v4, 0
|
|
v_dual_mov_b32 v19, 0 :: v_dual_mov_b32 v2, 0
|
|
; Also zero v214 (extra bank-2 VGPR used by output matrix C)
|
|
v_mov_b32 v214,0
|
|
|
|
v_mov_b32_e32 v5, 0
|
|
v_mov_b32_e32 v3, 0
|
|
s_add_i32 s7, s4, -1
|
|
s_add_u32 s8, s8, 32
|
|
s_addc_u32 s9, s9, 0
|
|
s_mov_b32 s12, 0
|
|
; implicit-def: $vgpr175
|
|
; implicit-def: $vgpr176
|
|
; implicit-def: $vgpr177
|
|
; implicit-def: $vgpr178
|
|
; implicit-def: $vgpr179
|
|
; implicit-def: $vgpr180
|
|
; implicit-def: $vgpr181
|
|
; implicit-def: $vgpr182
|
|
; implicit-def: $vgpr167
|
|
; implicit-def: $vgpr168
|
|
; implicit-def: $vgpr169
|
|
; implicit-def: $vgpr170
|
|
; implicit-def: $vgpr171
|
|
; implicit-def: $vgpr172
|
|
; implicit-def: $vgpr173
|
|
; implicit-def: $vgpr174
|
|
s_branch .LBB0_6
|
|
.LBB0_5: ; in Loop: Header=BB0_6 Depth=1
|
|
s_add_i32 s12, s12, 8
|
|
s_delay_alu instid0(SALU_CYCLE_1)
|
|
s_cmp_ge_i32 s12, s4
|
|
s_cbranch_scc1 .LBB0_12
|
|
.LBB0_6: ; =>This Loop Header: Depth=1
|
|
; Child Loop BB0_9 Depth 2
|
|
s_cmp_lt_i32 s12, s7
|
|
s_cselect_b32 s13, -1, 0
|
|
s_cmp_ge_i32 s12, s7
|
|
s_cbranch_scc1 .LBB0_8
|
|
; %bb.7: ; %.preheader192
|
|
; in Loop: Header=BB0_6 Depth=1
|
|
|
|
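; Advance the matrix B offset (v203) by 0x20000 bytes and the matrix A offset (v215) by 0x20 bytes, then issue the global memory reads for matrix B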
|
|
v_add_nc_u32_e32 v203, 0x20000, v203
|
|
v_add_nc_u32_e32 v215, 0x20, v215
|
|
s_setprio 0
|
|
global_load_b32 v167, v203, s[24:25]
|
|
global_load_b32 v168, v203, s[26:27]
|
|
global_load_b32 v169, v203, s[28:29]
|
|
global_load_b32 v170, v203, s[30:31]
|
|
|
|
|
|
|
|
|
|
.LBB0_8: ; %.loopexit
|
|
; in Loop: Header=BB0_6 Depth=1
|
|
v_mov_b32_e32 v183, v165
|
|
s_mov_b32 s14, 0
|
|
|
|
v_mov_b32 v202,v166
|
|
.LBB0_9: ; %.preheader188
|
|
; Parent Loop BB0_6 Depth=1
|
|
; => This Inner Loop Header: Depth=2
|
|
|
|
|
|
s_clause 0xB;
|
|
;A on bank 2-3
|
|
ds_load_b64 v[186:187], v183
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
;B on bank 0-1
|
|
ds_load_b64 v[184:185], v202
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183
|
|
v_add_nc_u32_e32 v202, 0x200, v202
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
; new VGPR allocation
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
s_clause 0x1
|
|
global_load_b32 v171, v203, s[32:33]
|
|
global_load_b32 v172, v203, s[34:35]
|
|
|
|
s_clause 0xB;
|
|
;A on bank 2-3
|
|
ds_load_b64 v[186:187], v183
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
|
|
;B on bank 0-1
|
|
ds_load_b64 v[184:185], v202
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183
|
|
v_add_nc_u32_e32 v202, 0x200, v202
|
|
;s_cmpk_lg_i32 s14, 0x1000
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
; new VGPR allocation
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
|
|
|
|
s_clause 0x1
|
|
global_load_b32 v173, v203, s[36:37]
|
|
global_load_b32 v174, v203, s[38:39]
|
|
|
|
s_clause 0xB;
|
|
;A on bank 2-3
|
|
ds_load_b64 v[186:187], v183
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
|
|
;B on bank 0-1
|
|
ds_load_b64 v[184:185], v202
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183
|
|
v_add_nc_u32_e32 v202, 0x200, v202
|
|
;s_cmpk_lg_i32 s14, 0x1000
|
|
s_waitcnt lgkmcnt(0)
|
|
|
|
; new VGPR allocation
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
|
|
; Unrolled step: prefetch two matrix A dwords from global memory, then read the
; A/B operands used by the following v_dual_fmac_f32 block from LDS.
; NOTE(review): v215 is presumably the per-thread matrix A byte offset; its
; computation is outside this view — confirm.
s_clause 0x1 ; SIMM16 = count - 1: groups the 2 global loads below
|
|
global_load_b32 v175, v215, s[40:41] ; base s[40:41] = A + 0x0000, byte offset v215
|
|
global_load_b32 v176, v215, s[42:43] ; base s[42:43] = A + 0x40000, byte offset v215
|
|
|
|
s_clause 0xB; SIMM16 = count - 1: groups the 12 ds_load_b64 below
|
|
;A on bank 2-3 (every destination VGPR number is 2 or 3 mod 4)
|
|
ds_load_b64 v[186:187], v183 ; v183 = LDS read address for the A operands
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
|
|
;B on bank 0-1 (every destination VGPR number is 0 or 1 mod 4)
|
|
ds_load_b64 v[184:185], v202 ; v202 = LDS read address for the B operands
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183 ; advance A read address by 0x210 (528) bytes for the next step
|
|
v_add_nc_u32_e32 v202, 0x200, v202 ; advance B read address by 0x200 (512) bytes for the next step
|
|
;s_cmpk_lg_i32 s14, 0x1000
|
|
s_waitcnt lgkmcnt(0) ; LDS reads done before the FMAs use them; the global loads above are only waited on by the later s_waitcnt vmcnt(0)
|
|
|
|
; new VGPR allocation (restored to the original one at .LBB0_12)
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
|
|
|
|
; Global memory read for matrix A
; Unrolled step: prefetch two matrix A dwords, then read the A/B operands used
; by the following v_dual_fmac_f32 block from LDS.
; NOTE(review): v215 is presumably the per-thread matrix A byte offset; its
; computation is outside this view — confirm.
|
|
s_clause 0x1 ; SIMM16 = count - 1: groups the 2 global loads below
|
|
global_load_b32 v177, v215, s[44:45] ; base s[44:45] = A + 0x80000, byte offset v215
|
|
global_load_b32 v178, v215, s[46:47] ; base s[46:47] = A + 0xc0000, byte offset v215
|
|
|
|
s_clause 0xB; SIMM16 = count - 1: groups the 12 ds_load_b64 below
|
|
;A on bank 2-3 (every destination VGPR number is 2 or 3 mod 4)
|
|
ds_load_b64 v[186:187], v183 ; v183 = LDS read address for the A operands
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
|
|
;B on bank 0-1 (every destination VGPR number is 0 or 1 mod 4)
|
|
ds_load_b64 v[184:185], v202 ; v202 = LDS read address for the B operands
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183 ; advance A read address by 0x210 (528) bytes for the next step
|
|
v_add_nc_u32_e32 v202, 0x200, v202 ; advance B read address by 0x200 (512) bytes for the next step
|
|
;s_cmpk_lg_i32 s14, 0x1000
|
|
s_waitcnt lgkmcnt(0) ; LDS reads done before the FMAs use them; the global loads above are only waited on by the later s_waitcnt vmcnt(0)
|
|
|
|
; new VGPR allocation (restored to the original one at .LBB0_12)
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
|
|
; Unrolled step: prefetch two matrix A dwords from global memory, then read the
; A/B operands used by the following v_dual_fmac_f32 block from LDS.
; NOTE(review): v215 is presumably the per-thread matrix A byte offset; its
; computation is outside this view — confirm.
s_clause 0x1 ; SIMM16 = count - 1: groups the 2 global loads below
|
|
global_load_b32 v179, v215, s[48:49] ; base s[48:49] = A + 0x100000, byte offset v215
|
|
global_load_b32 v180, v215, s[50:51] ; base s[50:51] = A + 0x140000, byte offset v215
|
|
|
|
|
|
s_clause 0xB; SIMM16 = count - 1: groups the 12 ds_load_b64 below
|
|
;A on bank 2-3 (every destination VGPR number is 2 or 3 mod 4)
|
|
ds_load_b64 v[186:187], v183 ; v183 = LDS read address for the A operands
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
|
|
;B on bank 0-1 (every destination VGPR number is 0 or 1 mod 4)
|
|
ds_load_b64 v[184:185], v202 ; v202 = LDS read address for the B operands
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183 ; advance A read address by 0x210 (528) bytes for the next step
|
|
v_add_nc_u32_e32 v202, 0x200, v202 ; advance B read address by 0x200 (512) bytes for the next step
|
|
;s_cmpk_lg_i32 s14, 0x1000
|
|
s_waitcnt lgkmcnt(0) ; LDS reads done before the FMAs use them; the global loads above are only waited on by the later s_waitcnt vmcnt(0)
|
|
|
|
; new VGPR allocation (restored to the original one at .LBB0_12)
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
; Unrolled step: prefetch the last two matrix A dwords (bases s[52:55]) from
; global memory, then read the A/B operands used by the following
; v_dual_fmac_f32 block from LDS.
; NOTE(review): v215 is presumably the per-thread matrix A byte offset; its
; computation is outside this view — confirm.
s_clause 0x1 ; SIMM16 = count - 1: groups the 2 global loads below
|
|
global_load_b32 v181, v215, s[52:53] ; base s[52:53] = A + 0x180000, byte offset v215
|
|
global_load_b32 v182, v215, s[54:55] ; base s[54:55] = A + 0x1c0000, byte offset v215
|
|
s_clause 0xB; SIMM16 = count - 1: groups the 12 ds_load_b64 below
|
|
;A on bank 2-3 (every destination VGPR number is 2 or 3 mod 4)
|
|
ds_load_b64 v[186:187], v183 ; v183 = LDS read address for the A operands
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
|
|
;B on bank 0-1 (every destination VGPR number is 0 or 1 mod 4)
|
|
ds_load_b64 v[184:185], v202 ; v202 = LDS read address for the B operands
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183 ; advance A read address by 0x210 (528) bytes for the next step
|
|
v_add_nc_u32_e32 v202, 0x200, v202 ; advance B read address by 0x200 (512) bytes for the next step
|
|
s_waitcnt lgkmcnt(0) ; LDS reads done before the FMAs use them; the global loads above are only waited on by the later s_waitcnt vmcnt(0)
|
|
|
|
; new VGPR allocation (restored to the original one at .LBB0_12)
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
|
|
|
|
|
|
|
|
|
|
; Last unrolled step before %bb.10: no global prefetch is issued here, only the
; LDS reads of the A/B operands for the following v_dual_fmac_f32 block.
s_clause 0xB; SIMM16 = count - 1: groups the 12 ds_load_b64 below
|
|
;A on bank 2-3 (every destination VGPR number is 2 or 3 mod 4)
|
|
ds_load_b64 v[186:187], v183 ; v183 = LDS read address for the A operands
|
|
ds_load_b64 v[190:191], v183 offset: 8
|
|
ds_load_b64 v[194:195], v183 offset: 64
|
|
ds_load_b64 v[198:199], v183 offset: 72
|
|
|
|
|
|
;B on bank 0-1 (every destination VGPR number is 0 or 1 mod 4)
|
|
ds_load_b64 v[184:185], v202 ; v202 = LDS read address for the B operands
|
|
ds_load_b64 v[188:189], v202 offset: 8
|
|
ds_load_b64 v[192:193], v202 offset: 128
|
|
ds_load_b64 v[196:197], v202 offset: 136
|
|
ds_load_b64 v[200:201], v202 offset: 256
|
|
ds_load_b64 v[204:205], v202 offset: 264
|
|
ds_load_b64 v[208:209], v202 offset: 384
|
|
ds_load_b64 v[212:213], v202 offset: 392
|
|
|
|
v_add_nc_u32_e32 v183, 0x210, v183 ; advance A read address by 0x210 (528) bytes
|
|
v_add_nc_u32_e32 v202, 0x200, v202 ; advance B read address by 0x200 (512) bytes
|
|
;s_cmpk_lg_i32 s14, 0x1000
|
|
s_waitcnt lgkmcnt(0) ; LDS reads done before the FMAs use them
|
|
|
|
; new VGPR allocation (restored to the original one at .LBB0_12)
|
|
|
|
v_dual_fmac_f32 v5, v186, v184 :: v_dual_fmac_f32 v2, v187, v185
|
|
s_setprio 1
|
|
v_dual_fmac_f32 v3, v186, v185 :: v_dual_fmac_f32 v4, v187, v184
|
|
v_dual_fmac_f32 v9, v186, v188 :: v_dual_fmac_f32 v6, v187, v189
|
|
v_dual_fmac_f32 v7, v187, v188 :: v_dual_fmac_f32 v8, v186, v189
|
|
v_dual_fmac_f32 v13, v190, v188 :: v_dual_fmac_f32 v10, v191, v189
|
|
v_dual_fmac_f32 v11, v190, v189 :: v_dual_fmac_f32 v12, v191, v188
|
|
v_dual_fmac_f32 v17, v190, v184 :: v_dual_fmac_f32 v14, v191, v185
|
|
v_dual_fmac_f32 v15, v191, v184 :: v_dual_fmac_f32 v16, v190, v185
|
|
v_dual_fmac_f32 v21, v194, v184 :: v_dual_fmac_f32 v18, v195, v185
|
|
v_dual_fmac_f32 v19, v194, v185 :: v_dual_fmac_f32 v20, v195, v184
|
|
v_dual_fmac_f32 v25, v194, v188 :: v_dual_fmac_f32 v22, v195, v189
|
|
v_dual_fmac_f32 v23, v195, v188 :: v_dual_fmac_f32 v24, v194, v189
|
|
v_dual_fmac_f32 v29, v198, v188 :: v_dual_fmac_f32 v26, v199, v189
|
|
v_dual_fmac_f32 v27, v198, v189 :: v_dual_fmac_f32 v28, v199, v188
|
|
v_dual_fmac_f32 v33, v198, v192 :: v_dual_fmac_f32 v30, v199, v193
|
|
v_dual_fmac_f32 v31, v199, v192 :: v_dual_fmac_f32 v32, v198, v193
|
|
v_dual_fmac_f32 v37, v186, v192 :: v_dual_fmac_f32 v34, v187, v193
|
|
v_dual_fmac_f32 v35, v186, v193 :: v_dual_fmac_f32 v36, v187, v192
|
|
v_dual_fmac_f32 v41, v186, v196 :: v_dual_fmac_f32 v38, v187, v197
|
|
v_dual_fmac_f32 v39, v187, v196 :: v_dual_fmac_f32 v40, v186, v197
|
|
v_dual_fmac_f32 v45, v190, v196 :: v_dual_fmac_f32 v42, v191, v197
|
|
v_dual_fmac_f32 v43, v190, v197 :: v_dual_fmac_f32 v44, v191, v196
|
|
v_dual_fmac_f32 v49, v190, v192 :: v_dual_fmac_f32 v46, v191, v193
|
|
v_dual_fmac_f32 v47, v191, v192 :: v_dual_fmac_f32 v48, v190, v193
|
|
v_dual_fmac_f32 v53, v194, v192 :: v_dual_fmac_f32 v50, v195, v193
|
|
v_dual_fmac_f32 v51, v194, v193 :: v_dual_fmac_f32 v52, v195, v192
|
|
v_dual_fmac_f32 v57, v194, v196 :: v_dual_fmac_f32 v54, v195, v197
|
|
v_dual_fmac_f32 v55, v195, v196 :: v_dual_fmac_f32 v56, v194, v197
|
|
v_dual_fmac_f32 v61, v198, v196 :: v_dual_fmac_f32 v58, v199, v197
|
|
v_dual_fmac_f32 v59, v198, v197 :: v_dual_fmac_f32 v60, v199, v196
|
|
v_dual_fmac_f32 v65, v198, v200 :: v_dual_fmac_f32 v62, v199, v201
|
|
v_dual_fmac_f32 v63, v199, v200 :: v_dual_fmac_f32 v64, v198, v201
|
|
v_dual_fmac_f32 v69, v186, v200 :: v_dual_fmac_f32 v66, v187, v201
|
|
v_dual_fmac_f32 v67, v186, v201 :: v_dual_fmac_f32 v68, v187, v200
|
|
v_dual_fmac_f32 v73, v186, v204 :: v_dual_fmac_f32 v70, v187, v205
|
|
v_dual_fmac_f32 v71, v187, v204 :: v_dual_fmac_f32 v72, v186, v205
|
|
v_dual_fmac_f32 v77, v190, v204 :: v_dual_fmac_f32 v74, v191, v205
|
|
v_dual_fmac_f32 v75, v190, v205 :: v_dual_fmac_f32 v76, v191, v204
|
|
v_dual_fmac_f32 v81, v190, v200 :: v_dual_fmac_f32 v78, v191, v201
|
|
v_dual_fmac_f32 v79, v191, v200 :: v_dual_fmac_f32 v80, v190, v201
|
|
v_dual_fmac_f32 v85, v194, v200 :: v_dual_fmac_f32 v82, v195, v201
|
|
v_dual_fmac_f32 v83, v194, v201 :: v_dual_fmac_f32 v84, v195, v200
|
|
v_dual_fmac_f32 v89, v194, v204 :: v_dual_fmac_f32 v86, v195, v205
|
|
v_dual_fmac_f32 v87, v195, v204 :: v_dual_fmac_f32 v88, v194, v205
|
|
v_dual_fmac_f32 v93, v198, v204 :: v_dual_fmac_f32 v90, v199, v205
|
|
v_dual_fmac_f32 v91, v198, v205 :: v_dual_fmac_f32 v92, v199, v204
|
|
v_dual_fmac_f32 v97, v198, v208 :: v_dual_fmac_f32 v94, v199, v209
|
|
v_dual_fmac_f32 v95, v199, v208 :: v_dual_fmac_f32 v96, v198, v209
|
|
v_dual_fmac_f32 v101, v186, v208 :: v_dual_fmac_f32 v98, v187, v209
|
|
v_dual_fmac_f32 v99, v186, v209 :: v_dual_fmac_f32 v100, v187, v208
|
|
v_dual_fmac_f32 v105, v186, v212 :: v_dual_fmac_f32 v102, v187, v213
|
|
v_dual_fmac_f32 v103, v187, v212 :: v_dual_fmac_f32 v104, v186, v213
|
|
v_dual_fmac_f32 v109, v190, v212 :: v_dual_fmac_f32 v106, v191, v213
|
|
v_dual_fmac_f32 v107, v190, v213 :: v_dual_fmac_f32 v108, v191, v212
|
|
v_dual_fmac_f32 v113, v190, v208 :: v_dual_fmac_f32 v110, v191, v209
|
|
v_dual_fmac_f32 v111, v191, v208 :: v_dual_fmac_f32 v112, v190, v209
|
|
v_dual_fmac_f32 v117, v194, v208 :: v_dual_fmac_f32 v114, v195, v209
|
|
v_dual_fmac_f32 v115, v194, v209 :: v_dual_fmac_f32 v116, v195, v208
|
|
v_dual_fmac_f32 v121, v194, v212 :: v_dual_fmac_f32 v122, v195, v213
|
|
v_dual_fmac_f32 v123, v195, v212 :: v_dual_fmac_f32 v120, v194, v213
|
|
v_dual_fmac_f32 v129, v198, v212 :: v_dual_fmac_f32 v126, v199, v213
|
|
v_dual_fmac_f32 v127, v198, v213 :: v_dual_fmac_f32 v124, v199, v212
|
|
v_dual_fmac_f32 v133, v198, v184 :: v_dual_fmac_f32 v214, v199, v185
|
|
v_dual_fmac_f32 v131, v199, v184 :: v_dual_fmac_f32 v128, v198, v185
|
|
|
|
s_setprio 0
|
|
|
|
|
|
|
|
;s_cbranch_scc1 .LBB0_9
|
|
; %bb.10: ; in Loop: Header=BB0_6 Depth=1
; End of the unrolled compute steps: wait for the prefetched global data,
; synchronize, then either skip or fall through into the LDS refill (%bb.11).
|
|
s_and_not1_b32 vcc_lo, exec_lo, s13 ; vcc_lo = exec_lo & ~s13 (s13 is set outside this view)
|
|
s_waitcnt vmcnt(0) ; every outstanding global_load_b32 issued during the steps above has returned
|
|
s_barrier ; NOTE(review): presumably ensures all waves finished reading LDS before %bb.11 overwrites it — confirm
|
|
s_cbranch_vccnz .LBB0_5 ; vcc != 0: skip the LDS refill below
|
|
; %bb.11: ; %.preheader190.preheader
|
|
; in Loop: Header=BB0_6 Depth=1
; Write the 16 prefetched dwords to LDS, interleaving the two groups:
;   v167-v174 -> LDS addresses v155, v158-v164
;   v175-v182 -> LDS addresses v141-v148 (loaded via the matrix A bases s[40:55])
; v173/v174 were loaded via the matrix B bases s[36:39]; v167-v172 are
; presumably the other matrix B prefetches (their loads are outside this
; view — confirm).
|
|
ds_store_b32 v155, v167
|
|
ds_store_b32 v141, v175
|
|
ds_store_b32 v158, v168
|
|
ds_store_b32 v142, v176
|
|
ds_store_b32 v159, v169
|
|
ds_store_b32 v143, v177
|
|
ds_store_b32 v160, v170
|
|
ds_store_b32 v144, v178
|
|
ds_store_b32 v161, v171
|
|
ds_store_b32 v145, v179
|
|
ds_store_b32 v162, v172
|
|
ds_store_b32 v146, v180
|
|
ds_store_b32 v163, v173
|
|
ds_store_b32 v147, v181
|
|
ds_store_b32 v164, v174
|
|
ds_store_b32 v148, v182
|
|
s_waitcnt lgkmcnt(0) ; all 16 LDS stores of this wave have completed
|
|
s_barrier ; workgroup-wide sync after the stores, before control returns to .LBB0_5
|
|
s_branch .LBB0_5 ; rejoin the path taken when the refill is skipped
|
|
|
|
.LBB0_12: ; %Flow
|
|
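; Restoring VGPR original allocation.
; Each three-instruction group below exchanges two VGPRs, using v200 as the
; scratch register (v200 is overwritten by every group).
; The "; vA -> vB & vB -> vA" comments appear to track values by the register
; they originally came from: once a chain of swaps is under way, the value
; "from vA" is physically held in the chain's carrier register (v2 for the
; first chain), which is why the moves name that register instead of vA.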
|
|
; v2 -> v128 & v128 -> v2
|
|
v_mov_b32 v200, v128
|
|
v_mov_b32 v128, v2
|
|
v_mov_b32 v2, v200
|
|
; v128 -> v56 & v56 -> v128
|
|
v_mov_b32 v200, v56
|
|
v_mov_b32 v56, v2
|
|
v_mov_b32 v2, v200
|
|
; v56 -> v46 & v46 -> v56
|
|
v_mov_b32 v200, v46
|
|
v_mov_b32 v46, v2
|
|
v_mov_b32 v2, v200
|
|
; v46 -> v100 & v100 -> v46
|
|
v_mov_b32 v200, v100
|
|
v_mov_b32 v100, v2
|
|
v_mov_b32 v2, v200
|
|
; v100 -> v77 & v77 -> v100
|
|
v_mov_b32 v200, v77
|
|
v_mov_b32 v77, v2
|
|
v_mov_b32 v2, v200
|
|
; v77 -> v87 & v87 -> v77
|
|
v_mov_b32 v200, v87
|
|
v_mov_b32 v87, v2
|
|
v_mov_b32 v2, v200
|
|
; v87 -> v27 & v27 -> v87
|
|
v_mov_b32 v200, v27
|
|
v_mov_b32 v27, v2
|
|
v_mov_b32 v2, v200
|
|
; v27 -> v54 & v54 -> v27
|
|
v_mov_b32 v200, v54
|
|
v_mov_b32 v54, v2
|
|
v_mov_b32 v2, v200
|
|
; v54 -> v42 & v42 -> v54
|
|
v_mov_b32 v200, v42
|
|
v_mov_b32 v42, v2
|
|
v_mov_b32 v2, v200
|
|
; v42 -> v98 & v98 -> v42
|
|
v_mov_b32 v200, v98
|
|
v_mov_b32 v98, v2
|
|
v_mov_b32 v2, v200
|
|
; v98 -> v76 & v76 -> v98
|
|
v_mov_b32 v200, v76
|
|
v_mov_b32 v76, v2
|
|
v_mov_b32 v2, v200
|
|
; v76 -> v83 & v83 -> v76
|
|
v_mov_b32 v200, v83
|
|
v_mov_b32 v83, v2
|
|
v_mov_b32 v2, v200
|
|
; v83 -> v32 & v32 -> v83
|
|
v_mov_b32 v200, v32
|
|
v_mov_b32 v32, v2
|
|
v_mov_b32 v2, v200
|
|
; v32 -> v40 & v40 -> v32
|
|
v_mov_b32 v200, v40
|
|
v_mov_b32 v40, v2
|
|
v_mov_b32 v2, v200
|
|
; v40 -> v110 & v110 -> v40
|
|
v_mov_b32 v200, v110
|
|
v_mov_b32 v110, v2
|
|
v_mov_b32 v2, v200
|
|
; v110 -> v68 & v68 -> v110
|
|
v_mov_b32 v200, v68
|
|
v_mov_b32 v68, v2
|
|
v_mov_b32 v2, v200
|
|
; v68 -> v93 & v93 -> v68
|
|
v_mov_b32 v200, v93
|
|
v_mov_b32 v93, v2
|
|
v_mov_b32 v2, v200
|
|
; v93 -> v23 & v23 -> v93
|
|
v_mov_b32 v200, v23
|
|
v_mov_b32 v23, v2
|
|
v_mov_b32 v2, v200
|
|
; v23 -> v59 & v59 -> v23
|
|
v_mov_b32 v200, v59
|
|
v_mov_b32 v59, v2
|
|
v_mov_b32 v2, v200
|
|
; v59 -> v38 & v38 -> v59
|
|
v_mov_b32 v200, v38
|
|
v_mov_b32 v38, v2
|
|
v_mov_b32 v2, v200
|
|
; v38 -> v106 & v106 -> v38
|
|
v_mov_b32 v200, v106
|
|
v_mov_b32 v106, v2
|
|
v_mov_b32 v2, v200
|
|
; v106 -> v66 & v66 -> v106
|
|
v_mov_b32 v200, v66
|
|
v_mov_b32 v66, v2
|
|
v_mov_b32 v2, v200
|
|
; v66 -> v92 & v92 -> v66
|
|
v_mov_b32 v200, v92
|
|
v_mov_b32 v92, v2
|
|
v_mov_b32 v2, v200
|
|
; v92 -> v19 & v19 -> v92
|
|
v_mov_b32 v200, v19
|
|
v_mov_b32 v19, v2
|
|
v_mov_b32 v2, v200
|
|
; v19 -> v64 & v64 -> v19
|
|
v_mov_b32 v200, v64
|
|
v_mov_b32 v64, v2
|
|
v_mov_b32 v2, v200
|
|
; v64 -> v24 & v24 -> v64
|
|
v_mov_b32 v200, v24
|
|
v_mov_b32 v24, v2
|
|
v_mov_b32 v2, v200
|
|
; v24 -> v62 & v62 -> v24
|
|
v_mov_b32 v200, v62
|
|
v_mov_b32 v62, v2
|
|
v_mov_b32 v2, v200
|
|
; v62 -> v20 & v20 -> v62
|
|
v_mov_b32 v200, v20
|
|
v_mov_b32 v20, v2
|
|
v_mov_b32 v2, v200
|
|
; v20 -> v61 & v61 -> v20
|
|
v_mov_b32 v200, v61
|
|
v_mov_b32 v61, v2
|
|
v_mov_b32 v2, v200
|
|
; v61 -> v39 & v39 -> v61
|
|
v_mov_b32 v200, v39
|
|
v_mov_b32 v39, v2
|
|
v_mov_b32 v2, v200
|
|
; v39 -> v107 & v107 -> v39
|
|
v_mov_b32 v200, v107
|
|
v_mov_b32 v107, v2
|
|
v_mov_b32 v2, v200
|
|
; v107 -> v70 & v70 -> v107
|
|
v_mov_b32 v200, v70
|
|
v_mov_b32 v70, v2
|
|
v_mov_b32 v2, v200
|
|
; v70 -> v90 & v90 -> v70
|
|
v_mov_b32 v200, v90
|
|
v_mov_b32 v90, v2
|
|
v_mov_b32 v2, v200
|
|
; v90 -> v18 & v18 -> v90
|
|
v_mov_b32 v200, v18
|
|
v_mov_b32 v18, v2
|
|
v_mov_b32 v2, v200
|
|
; v18 -> v60 & v60 -> v18
|
|
v_mov_b32 v200, v60
|
|
v_mov_b32 v60, v2
|
|
v_mov_b32 v2, v200
|
|
; v60 -> v35 & v35 -> v60
|
|
v_mov_b32 v200, v35
|
|
v_mov_b32 v35, v2
|
|
v_mov_b32 v2, v200
|
|
; v35 -> v112 & v112 -> v35
|
|
v_mov_b32 v200, v112
|
|
v_mov_b32 v112, v2
|
|
v_mov_b32 v2, v200
|
|
; v112 -> v72 & v72 -> v112
|
|
v_mov_b32 v200, v72
|
|
v_mov_b32 v72, v2
|
|
v_mov_b32 v2, v200
|
|
; v72 -> v94 & v94 -> v72
|
|
v_mov_b32 v200, v94
|
|
v_mov_b32 v94, v2
|
|
v_mov_b32 v2, v200
|
|
; v94 -> v4 & v4 -> v94
|
|
v_mov_b32 v200, v4
|
|
v_mov_b32 v4, v2
|
|
v_mov_b32 v2, v200
|
|
; v4 -> v129 & v129 -> v4
|
|
v_mov_b32 v200, v129
|
|
v_mov_b32 v129, v2
|
|
v_mov_b32 v2, v200
|
|
; v129 -> v7 & v7 -> v129
|
|
v_mov_b32 v200, v7
|
|
v_mov_b32 v7, v2
|
|
v_mov_b32 v2, v200
|
|
; v7 -> v127 & v127 -> v7
|
|
v_mov_b32 v200, v127
|
|
v_mov_b32 v127, v2
|
|
v_mov_b32 v2, v200
|
|
; v127 -> v6 & v6 -> v127
|
|
v_mov_b32 v200, v6
|
|
v_mov_b32 v6, v2
|
|
v_mov_b32 v2, v200
|
|
; v6 -> v126 & v126 -> v6
|
|
v_mov_b32 v200, v126
|
|
v_mov_b32 v126, v2
|
|
v_mov_b32 v2, v200
|
|
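; NOP 126(2) -> 2: end of this swap chain. The value just taken from v126 is already in v2, so no further move is needed.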
|
|
; v3 -> v133 & v133 -> v3
|
|
v_mov_b32 v200, v133
|
|
v_mov_b32 v133, v3
|
|
v_mov_b32 v3, v200
|
|
; v133 -> v57 & v57 -> v133
|
|
v_mov_b32 v200, v57
|
|
v_mov_b32 v57, v3
|
|
v_mov_b32 v3, v200
|
|
; v57 -> v47 & v47 -> v57
|
|
v_mov_b32 v200, v47
|
|
v_mov_b32 v47, v3
|
|
v_mov_b32 v3, v200
|
|
; v47 -> v101 & v101 -> v47
|
|
v_mov_b32 v200, v101
|
|
v_mov_b32 v101, v3
|
|
v_mov_b32 v3, v200
|
|
; v101 -> v81 & v81 -> v101
|
|
v_mov_b32 v200, v81
|
|
v_mov_b32 v81, v3
|
|
v_mov_b32 v3, v200
|
|
; v81 -> v89 & v89 -> v81
|
|
v_mov_b32 v200, v89
|
|
v_mov_b32 v89, v3
|
|
v_mov_b32 v3, v200
|
|
; v89 -> v31 & v31 -> v89
|
|
v_mov_b32 v200, v31
|
|
v_mov_b32 v31, v3
|
|
v_mov_b32 v3, v200
|
|
; v31 -> v37 & v37 -> v31
|
|
v_mov_b32 v200, v37
|
|
v_mov_b32 v37, v3
|
|
v_mov_b32 v3, v200
|
|
; v37 -> v113 & v113 -> v37
|
|
v_mov_b32 v200, v113
|
|
v_mov_b32 v113, v3
|
|
v_mov_b32 v3, v200
|
|
; v113 -> v73 & v73 -> v113
|
|
v_mov_b32 v200, v73
|
|
v_mov_b32 v73, v3
|
|
v_mov_b32 v3, v200
|
|
; v73 -> v95 & v95 -> v73
|
|
v_mov_b32 v200, v95
|
|
v_mov_b32 v95, v3
|
|
v_mov_b32 v3, v200
|
|
; v95 -> v5 & v5 -> v95
|
|
v_mov_b32 v200, v5
|
|
v_mov_b32 v5, v3
|
|
v_mov_b32 v3, v200
|
|
; v5 -> v124 & v124 -> v5
|
|
v_mov_b32 v200, v124
|
|
v_mov_b32 v124, v3
|
|
v_mov_b32 v3, v200
|
|
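; NOP 124(3) -> 3: end of this swap chain. The value just taken from v124 is already in v3, so no further move is needed.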
|
|
; v8 -> v131 & v131 -> v8
|
|
v_mov_b32 v200, v131
|
|
v_mov_b32 v131, v8
|
|
v_mov_b32 v8, v200
|
|
; v131 -> v53 & v53 -> v131
|
|
v_mov_b32 v200, v53
|
|
v_mov_b32 v53, v8
|
|
v_mov_b32 v8, v200
|
|
; v53 -> v49 & v49 -> v53
|
|
v_mov_b32 v200, v49
|
|
v_mov_b32 v49, v8
|
|
v_mov_b32 v8, v200
|
|
; v49 -> v105 & v105 -> v49
|
|
v_mov_b32 v200, v105
|
|
v_mov_b32 v105, v8
|
|
v_mov_b32 v8, v200
|
|
; v105 -> v79 & v79 -> v105
|
|
v_mov_b32 v200, v79
|
|
v_mov_b32 v79, v8
|
|
v_mov_b32 v8, v200
|
|
; v79 -> v85 & v85 -> v79
|
|
v_mov_b32 v200, v85
|
|
v_mov_b32 v85, v8
|
|
v_mov_b32 v8, v200
|
|
; v85 -> v33 & v33 -> v85
|
|
v_mov_b32 v200, v33
|
|
v_mov_b32 v33, v8
|
|
v_mov_b32 v8, v200
|
|
; v33 -> v41 & v41 -> v33
|
|
v_mov_b32 v200, v41
|
|
v_mov_b32 v41, v8
|
|
v_mov_b32 v8, v200
|
|
; v41 -> v111 & v111 -> v41
|
|
v_mov_b32 v200, v111
|
|
v_mov_b32 v111, v8
|
|
v_mov_b32 v8, v200
|
|
; v111 -> v69 & v69 -> v111
|
|
v_mov_b32 v200, v69
|
|
v_mov_b32 v69, v8
|
|
v_mov_b32 v8, v200
|
|
; v69 -> v97 & v97 -> v69
|
|
v_mov_b32 v200, v97
|
|
v_mov_b32 v97, v8
|
|
v_mov_b32 v8, v200
|
|
; v97 -> v9 & v9 -> v97
|
|
v_mov_b32 v200, v9
|
|
v_mov_b32 v9, v8
|
|
v_mov_b32 v8, v200
|
|
; v9 -> v132 & v132 -> v9
|
|
v_mov_b32 v200, v132
|
|
v_mov_b32 v132, v8
|
|
v_mov_b32 v8, v200
|
|
; v10 -> v114 & v114 -> v10
|
|
v_mov_b32 v200, v114
|
|
v_mov_b32 v114, v10
|
|
v_mov_b32 v10, v200
|
|
; v114 -> v12 & v12 -> v114
|
|
v_mov_b32 v200, v12
|
|
v_mov_b32 v12, v10
|
|
v_mov_b32 v10, v200
|
|
; v12 -> v115 & v115 -> v12
|
|
v_mov_b32 v200, v115
|
|
v_mov_b32 v115, v10
|
|
v_mov_b32 v10, v200
|
|
; v115 -> v16 & v16 -> v115
|
|
v_mov_b32 v200, v16
|
|
v_mov_b32 v16, v10
|
|
v_mov_b32 v10, v200
|
|
; v16 -> v122 & v122 -> v16
|
|
v_mov_b32 v200, v122
|
|
v_mov_b32 v122, v10
|
|
v_mov_b32 v10, v200
|
|
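; NOP 122(10) -> 10: end of this swap chain. The value just taken from v122 is already in v10, so no further move is needed.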
|
|
; v11 -> v120 & v120 -> v11
|
|
v_mov_b32 v200, v120
|
|
v_mov_b32 v120, v11
|
|
v_mov_b32 v11, v200
|
|
; v120 -> v14 & v14 -> v120
|
|
v_mov_b32 v200, v14
|
|
v_mov_b32 v14, v11
|
|
v_mov_b32 v11, v200
|
|
; v14 -> v116 & v116 -> v14
|
|
v_mov_b32 v200, v116
|
|
v_mov_b32 v116, v11
|
|
v_mov_b32 v11, v200
|
|
; v116 -> v13 & v13 -> v116
|
|
v_mov_b32 v200, v13
|
|
v_mov_b32 v13, v11
|
|
v_mov_b32 v11, v200
|
|
; v13 -> v121 & v121 -> v13
|
|
v_mov_b32 v200, v121
|
|
v_mov_b32 v121, v11
|
|
v_mov_b32 v11, v200
|
|
; v121 -> v15 & v15 -> v121
|
|
v_mov_b32 v200, v15
|
|
v_mov_b32 v15, v11
|
|
v_mov_b32 v11, v200
|
|
; v15 -> v117 & v117 -> v15
|
|
v_mov_b32 v200, v117
|
|
v_mov_b32 v117, v11
|
|
v_mov_b32 v11, v200
|
|
; v117 -> v17 & v17 -> v117
|
|
v_mov_b32 v200, v17
|
|
v_mov_b32 v17, v11
|
|
v_mov_b32 v11, v200
|
|
; v17 -> v123 & v123 -> v17
|
|
v_mov_b32 v200, v123
|
|
v_mov_b32 v123, v11
|
|
v_mov_b32 v11, v200
|
|
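; NOP 123(11) -> 11: end of this swap chain. The value just taken from v123 is already in v11, so no further move is needed.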
|
|
; v21 -> v65 & v65 -> v21
|
|
v_mov_b32 v200, v65
|
|
v_mov_b32 v65, v21
|
|
v_mov_b32 v21, v200
|
|
; v65 -> v25 & v25 -> v65
|
|
v_mov_b32 v200, v25
|
|
v_mov_b32 v25, v21
|
|
v_mov_b32 v21, v200
|
|
; v25 -> v63 & v63 -> v25
|
|
v_mov_b32 v200, v63
|
|
v_mov_b32 v63, v21
|
|
v_mov_b32 v21, v200
|
|
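; NOP 63(21) -> 21: end of this swap chain. The value just taken from v63 is already in v21, so no further move is needed.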
|
|
; v22 -> v58 & v58 -> v22
|
|
v_mov_b32 v200, v58
|
|
v_mov_b32 v58, v22
|
|
v_mov_b32 v22, v200
|
|
; v58 -> v34 & v34 -> v58
|
|
v_mov_b32 v200, v34
|
|
v_mov_b32 v34, v22
|
|
v_mov_b32 v22, v200
|
|
; v34 -> v108 & v108 -> v34
|
|
v_mov_b32 v200, v108
|
|
v_mov_b32 v108, v22
|
|
v_mov_b32 v22, v200
|
|
; v108 -> v67 & v67 -> v108
|
|
v_mov_b32 v200, v67
|
|
v_mov_b32 v67, v22
|
|
v_mov_b32 v22, v200
|
|
; v67 -> v96 & v96 -> v67
|
|
v_mov_b32 v200, v96
|
|
v_mov_b32 v96, v22
|
|
v_mov_b32 v22, v200
|
|
; v96 -> v8 & v8 -> v96
|
|
v_mov_b32 v200, v8
|
|
v_mov_b32 v8, v22
|
|
v_mov_b32 v22, v200
|
|
; v26 -> v50 & v50 -> v26
|
|
v_mov_b32 v200, v50
|
|
v_mov_b32 v50, v26
|
|
v_mov_b32 v26, v200
|
|
; v50 -> v44 & v44 -> v50
|
|
v_mov_b32 v200, v44
|
|
v_mov_b32 v44, v26
|
|
v_mov_b32 v26, v200
|
|
; v44 -> v99 & v99 -> v44
|
|
v_mov_b32 v200, v99
|
|
v_mov_b32 v99, v26
|
|
v_mov_b32 v26, v200
|
|
; v99 -> v80 & v80 -> v99
|
|
v_mov_b32 v200, v80
|
|
v_mov_b32 v80, v26
|
|
v_mov_b32 v26, v200
|
|
; v80 -> v88 & v88 -> v80
|
|
v_mov_b32 v200, v88
|
|
v_mov_b32 v88, v26
|
|
v_mov_b32 v26, v200
|
|
; v88 -> v30 & v30 -> v88
|
|
v_mov_b32 v200, v30
|
|
v_mov_b32 v30, v26
|
|
v_mov_b32 v26, v200
|
|
; v30 -> v36 & v36 -> v30
|
|
v_mov_b32 v200, v36
|
|
v_mov_b32 v36, v26
|
|
v_mov_b32 v26, v200
|
|
; v36 -> v109 & v109 -> v36
|
|
v_mov_b32 v200, v109
|
|
v_mov_b32 v109, v26
|
|
v_mov_b32 v26, v200
|
|
; v109 -> v71 & v71 -> v109
|
|
v_mov_b32 v200, v71
|
|
v_mov_b32 v71, v26
|
|
v_mov_b32 v26, v200
|
|
; v71 -> v91 & v91 -> v71
|
|
v_mov_b32 v200, v91
|
|
v_mov_b32 v91, v26
|
|
v_mov_b32 v26, v200
|
|
; v91 -> v22 & v22 -> v91
|
|
v_mov_b32 v200, v22
|
|
v_mov_b32 v22, v26
|
|
v_mov_b32 v26, v200
|
|
; v28 -> v51 & v51 -> v28
|
|
v_mov_b32 v200, v51
|
|
v_mov_b32 v51, v28
|
|
v_mov_b32 v28, v200
|
|
; v51 -> v48 & v48 -> v51
|
|
v_mov_b32 v200, v48
|
|
v_mov_b32 v48, v28
|
|
v_mov_b32 v28, v200
|
|
; v48 -> v104 & v104 -> v48
|
|
v_mov_b32 v200, v104
|
|
v_mov_b32 v104, v28
|
|
v_mov_b32 v28, v200
|
|
; v104 -> v78 & v78 -> v104
|
|
v_mov_b32 v200, v78
|
|
v_mov_b32 v78, v28
|
|
v_mov_b32 v28, v200
|
|
; v78 -> v84 & v84 -> v78
|
|
v_mov_b32 v200, v84
|
|
v_mov_b32 v84, v28
|
|
v_mov_b32 v28, v200
|
|
; v84 -> v29 & v29 -> v84
|
|
v_mov_b32 v200, v29
|
|
v_mov_b32 v29, v28
|
|
v_mov_b32 v28, v200
|
|
; v29 -> v55 & v55 -> v29
|
|
v_mov_b32 v200, v55
|
|
v_mov_b32 v55, v28
|
|
v_mov_b32 v28, v200
|
|
; v55 -> v43 & v43 -> v55
|
|
v_mov_b32 v200, v43
|
|
v_mov_b32 v43, v28
|
|
v_mov_b32 v28, v200
|
|
; v43 -> v102 & v102 -> v43
|
|
v_mov_b32 v200, v102
|
|
v_mov_b32 v102, v28
|
|
v_mov_b32 v28, v200
|
|
; v102 -> v74 & v74 -> v102
|
|
v_mov_b32 v200, v74
|
|
v_mov_b32 v74, v28
|
|
v_mov_b32 v28, v200
|
|
; v74 -> v82 & v82 -> v74
|
|
v_mov_b32 v200, v82
|
|
v_mov_b32 v82, v28
|
|
v_mov_b32 v28, v200
|
|
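; NOP 82(28) -> 28: end of this swap chain. The value just taken from v82 is already in v28, so no further move is needed.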
|
|
; v45 -> v103 & v103 -> v45
|
|
v_mov_b32 v200, v103
|
|
v_mov_b32 v103, v45
|
|
v_mov_b32 v45, v200
|
|
; v103 -> v75 & v75 -> v103
|
|
v_mov_b32 v200, v75
|
|
v_mov_b32 v75, v45
|
|
v_mov_b32 v45, v200
|
|
; v75 -> v86 & v86 -> v75
|
|
v_mov_b32 v200, v86
|
|
v_mov_b32 v86, v45
|
|
v_mov_b32 v45, v200
|
|
; v86 -> v26 & v26 -> v86
|
|
v_mov_b32 v200, v26
|
|
v_mov_b32 v26, v45
|
|
v_mov_b32 v45, v200
|
|
; v52 -> v45 & v45 -> v52
|
|
v_mov_b32 v200, v45
|
|
v_mov_b32 v45, v52
|
|
v_mov_b32 v52, v200
|
|
; v214 -> v52 & v52 -> v214
|
|
v_mov_b32 v200, v52
|
|
v_mov_b32 v52, v214
|
|
v_mov_b32 v214, v200
|
|
|
|
|
|
v_dual_mov_b32 v149, v135 :: v_dual_mov_b32 v150, v136
|
|
.LBB0_13: ; %Flow1143
|
|
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_dual_mul_f32 v139, s5, v133 :: v_dual_and_b32 v0, 0x60, v0
|
|
v_or_b32_e32 v118, s2, v149
|
|
v_dual_mul_f32 v140, s5, v132 :: v_dual_mul_f32 v141, s5, v131
|
|
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_add_nc_u32_e32 v0, s3, v0
|
|
v_or_b32_e32 v119, v0, v150
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_mul_lo_u32 v144, v119, s4
|
|
v_add_nc_u32_e32 v0, v118, v144
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v1, 31, v0
|
|
v_lshlrev_b64 v[0:1], 2, v[0:1]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v0, vcc_lo, s0, v0
|
|
v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
|
|
global_load_b128 v[134:137], v[0:1], off
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_mul_f32 v138, s5, v124 :: v_dual_fmac_f32 v141, s6, v137
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
|
|
v_dual_fmac_f32 v138, s6, v134 :: v_dual_add_nc_u32 v145, s4, v144
|
|
v_fmac_f32_e32 v139, s6, v135
|
|
v_mul_f32_e32 v137, s5, v126
|
|
v_dual_mul_f32 v135, s5, v128 :: v_dual_add_nc_u32 v142, v118, v145
|
|
v_fmac_f32_e32 v140, s6, v136
|
|
v_mul_f32_e32 v134, s5, v129
|
|
v_dual_mul_f32 v128, s5, v123 :: v_dual_mul_f32 v129, s5, v122
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v143, 31, v142
|
|
global_store_b128 v[0:1], v[138:141], off
|
|
v_add_nc_u32_e32 v138, s4, v145
|
|
v_mul_f32_e32 v136, s5, v127
|
|
v_lshlrev_b64 v[124:125], 2, v[142:143]
|
|
v_add_co_u32 v124, vcc_lo, s0, v124
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
|
|
v_add_co_ci_u32_e32 v125, vcc_lo, s1, v125, vcc_lo
|
|
global_load_b128 v[130:133], v[124:125], off
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v137, s6, v133 :: v_dual_add_nc_u32 v0, v118, v138
|
|
v_ashrrev_i32_e32 v1, 31, v0
|
|
v_dual_fmac_f32 v135, s6, v131 :: v_dual_fmac_f32 v136, s6, v132
|
|
v_dual_mul_f32 v131, s5, v120 :: v_dual_fmac_f32 v134, s6, v130
|
|
s_delay_alu instid0(VALU_DEP_3)
|
|
v_lshlrev_b64 v[0:1], 2, v[0:1]
|
|
global_store_b128 v[124:125], v[134:137], off
|
|
v_add_nc_u32_e32 v134, s4, v138
|
|
v_add_co_u32 v0, vcc_lo, s0, v0
|
|
v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
|
|
global_load_b128 v[124:127], v[0:1], off
|
|
s_waitcnt vmcnt(0)
|
|
v_fmac_f32_e32 v129, s6, v125
|
|
v_dual_mul_f32 v125, s5, v116 :: v_dual_add_nc_u32 v132, v118, v134
|
|
v_fmac_f32_e32 v131, s6, v127
|
|
v_dual_mul_f32 v127, s5, v114 :: v_dual_mul_f32 v130, s5, v121
|
|
v_fmac_f32_e32 v128, s6, v124
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
v_ashrrev_i32_e32 v133, 31, v132
|
|
v_mul_f32_e32 v124, s5, v117
|
|
v_fmac_f32_e32 v130, s6, v126
|
|
s_delay_alu instid0(VALU_DEP_3)
|
|
v_lshlrev_b64 v[120:121], 2, v[132:133]
|
|
global_store_b128 v[0:1], v[128:131], off
|
|
v_add_nc_u32_e32 v1, 32, v118
|
|
v_add_nc_u32_e32 v0, 0x60, v118
|
|
v_add_co_u32 v132, vcc_lo, s0, v120
|
|
v_add_co_ci_u32_e32 v133, vcc_lo, s1, v121, vcc_lo
|
|
global_load_b128 v[120:123], v[132:133], off
|
|
s_waitcnt vmcnt(0)
|
|
v_fmac_f32_e32 v125, s6, v121
|
|
v_mul_f32_e32 v121, s5, v112
|
|
v_dual_fmac_f32 v127, s6, v123 :: v_dual_add_nc_u32 v128, v1, v144
|
|
v_dual_mul_f32 v123, s5, v110 :: v_dual_mul_f32 v126, s5, v115
|
|
v_fmac_f32_e32 v124, s6, v120
|
|
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
|
|
v_ashrrev_i32_e32 v129, 31, v128
|
|
v_mul_f32_e32 v120, s5, v113
|
|
v_fmac_f32_e32 v126, s6, v122
|
|
s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
|
|
v_lshlrev_b64 v[114:115], 2, v[128:129]
|
|
global_store_b128 v[132:133], v[124:127], off
|
|
v_add_nc_u32_e32 v124, v1, v145
|
|
v_add_co_u32 v128, vcc_lo, s0, v114
|
|
v_add_co_ci_u32_e32 v129, vcc_lo, s1, v115, vcc_lo
|
|
v_ashrrev_i32_e32 v125, 31, v124
|
|
global_load_b128 v[114:117], v[128:129], off
|
|
s_waitcnt vmcnt(0)
|
|
v_fmac_f32_e32 v123, s6, v117
|
|
v_dual_mul_f32 v117, s5, v106 :: v_dual_mul_f32 v122, s5, v111
|
|
v_lshlrev_b64 v[110:111], 2, v[124:125]
|
|
v_dual_fmac_f32 v120, s6, v114 :: v_dual_fmac_f32 v121, s6, v115
|
|
v_mul_f32_e32 v114, s5, v109
|
|
s_delay_alu instid0(VALU_DEP_4)
|
|
v_fmac_f32_e32 v122, s6, v116
|
|
v_mul_f32_e32 v116, s5, v107
|
|
v_add_co_u32 v124, vcc_lo, s0, v110
|
|
v_add_co_ci_u32_e32 v125, vcc_lo, s1, v111, vcc_lo
|
|
global_store_b128 v[128:129], v[120:123], off
|
|
v_dual_mul_f32 v115, s5, v108 :: v_dual_add_nc_u32 v120, v1, v138
|
|
global_load_b128 v[110:113], v[124:125], off
|
|
v_ashrrev_i32_e32 v121, 31, v120
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_lshlrev_b64 v[106:107], 2, v[120:121]
|
|
v_add_co_u32 v120, vcc_lo, s0, v106
|
|
s_delay_alu instid0(VALU_DEP_2)
|
|
v_add_co_ci_u32_e32 v121, vcc_lo, s1, v107, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v114, s6, v110 :: v_dual_fmac_f32 v115, s6, v111
|
|
v_dual_fmac_f32 v116, s6, v112 :: v_dual_fmac_f32 v117, s6, v113
|
|
v_dual_mul_f32 v112, s5, v103 :: v_dual_mul_f32 v113, s5, v102
|
|
v_dual_mul_f32 v110, s5, v105 :: v_dual_mul_f32 v111, s5, v104
|
|
global_store_b128 v[124:125], v[114:117], off
|
|
global_load_b128 v[106:109], v[120:121], off
|
|
v_add_nc_u32_e32 v114, v1, v134
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v115, 31, v114
|
|
v_lshlrev_b64 v[102:103], 2, v[114:115]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v114, vcc_lo, s0, v102
|
|
v_add_co_ci_u32_e32 v115, vcc_lo, s1, v103, vcc_lo
|
|
v_add_nc_u32_e32 v102, 64, v118
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v110, s6, v106 :: v_dual_fmac_f32 v111, s6, v107
|
|
v_dual_fmac_f32 v112, s6, v108 :: v_dual_fmac_f32 v113, s6, v109
|
|
v_mul_f32_e32 v109, s5, v99
|
|
v_dual_mul_f32 v107, s5, v101 :: v_dual_mul_f32 v108, s5, v100
|
|
global_store_b128 v[120:121], v[110:113], off
|
|
global_load_b128 v[103:106], v[114:115], off
|
|
v_dual_mul_f32 v110, s5, v98 :: v_dual_add_nc_u32 v111, v102, v144
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v112, 31, v111
|
|
v_lshlrev_b64 v[98:99], 2, v[111:112]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v111, vcc_lo, s0, v98
|
|
v_add_co_ci_u32_e32 v112, vcc_lo, s1, v99, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v107, s6, v103 :: v_dual_fmac_f32 v108, s6, v104
|
|
v_dual_fmac_f32 v109, s6, v105 :: v_dual_fmac_f32 v110, s6, v106
|
|
v_dual_mul_f32 v104, s5, v96 :: v_dual_mul_f32 v105, s5, v95
|
|
v_dual_mul_f32 v106, s5, v94 :: v_dual_mul_f32 v103, s5, v97
|
|
global_store_b128 v[114:115], v[107:110], off
|
|
global_load_b128 v[98:101], v[111:112], off
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v104, s6, v99 :: v_dual_mul_f32 v99, s5, v92
|
|
v_add_nc_u32_e32 v107, v102, v145
|
|
v_fmac_f32_e32 v103, s6, v98
|
|
v_dual_fmac_f32 v105, s6, v100 :: v_dual_fmac_f32 v106, s6, v101
|
|
v_mul_f32_e32 v100, s5, v91
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
|
|
v_ashrrev_i32_e32 v108, 31, v107
|
|
v_dual_mul_f32 v101, s5, v90 :: v_dual_mul_f32 v98, s5, v93
|
|
global_store_b128 v[111:112], v[103:106], off
|
|
v_add_nc_u32_e32 v103, v102, v138
|
|
v_lshlrev_b64 v[94:95], 2, v[107:108]
|
|
v_ashrrev_i32_e32 v104, 31, v103
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
v_add_co_u32 v107, vcc_lo, s0, v94
|
|
v_add_co_ci_u32_e32 v108, vcc_lo, s1, v95, vcc_lo
|
|
s_delay_alu instid0(VALU_DEP_3)
|
|
v_lshlrev_b64 v[90:91], 2, v[103:104]
|
|
global_load_b128 v[94:97], v[107:108], off
|
|
v_add_co_u32 v103, vcc_lo, s0, v90
|
|
v_add_co_ci_u32_e32 v104, vcc_lo, s1, v91, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v98, s6, v94 :: v_dual_fmac_f32 v99, s6, v95
|
|
v_dual_fmac_f32 v100, s6, v96 :: v_dual_fmac_f32 v101, s6, v97
|
|
v_dual_mul_f32 v96, s5, v87 :: v_dual_mul_f32 v97, s5, v86
|
|
v_dual_mul_f32 v94, s5, v89 :: v_dual_mul_f32 v95, s5, v88
|
|
global_store_b128 v[107:108], v[98:101], off
|
|
global_load_b128 v[90:93], v[103:104], off
|
|
v_add_nc_u32_e32 v98, v102, v134
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v99, 31, v98
|
|
v_lshlrev_b64 v[86:87], 2, v[98:99]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v98, vcc_lo, s0, v86
|
|
v_add_co_ci_u32_e32 v99, vcc_lo, s1, v87, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v94, s6, v90 :: v_dual_fmac_f32 v95, s6, v91
|
|
v_dual_fmac_f32 v96, s6, v92 :: v_dual_fmac_f32 v97, s6, v93
|
|
v_dual_mul_f32 v92, s5, v83 :: v_dual_mul_f32 v93, s5, v82
|
|
v_dual_mul_f32 v90, s5, v85 :: v_dual_mul_f32 v91, s5, v84
|
|
global_store_b128 v[103:104], v[94:97], off
|
|
global_load_b128 v[86:89], v[98:99], off
|
|
v_add_nc_u32_e32 v94, v0, v144
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v95, 31, v94
|
|
v_lshlrev_b64 v[82:83], 2, v[94:95]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v94, vcc_lo, s0, v82
|
|
v_add_co_ci_u32_e32 v95, vcc_lo, s1, v83, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v90, s6, v86 :: v_dual_fmac_f32 v91, s6, v87
|
|
v_dual_fmac_f32 v92, s6, v88 :: v_dual_fmac_f32 v93, s6, v89
|
|
v_dual_mul_f32 v87, s5, v80 :: v_dual_mul_f32 v88, s5, v79
|
|
v_dual_mul_f32 v89, s5, v78 :: v_dual_mul_f32 v86, s5, v81
|
|
global_store_b128 v[98:99], v[90:93], off
|
|
global_load_b128 v[82:85], v[94:95], off
|
|
s_waitcnt vmcnt(0)
|
|
v_fmac_f32_e32 v87, s6, v83
|
|
v_dual_mul_f32 v83, s5, v76 :: v_dual_add_nc_u32 v90, v0, v145
|
|
v_fmac_f32_e32 v86, s6, v82
|
|
v_dual_fmac_f32 v88, s6, v84 :: v_dual_fmac_f32 v89, s6, v85
|
|
v_mul_f32_e32 v84, s5, v75
|
|
s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_4) | instid1(VALU_DEP_2)
|
|
v_ashrrev_i32_e32 v91, 31, v90
|
|
v_dual_mul_f32 v85, s5, v74 :: v_dual_mul_f32 v82, s5, v77
|
|
global_store_b128 v[94:95], v[86:89], off
|
|
v_add_nc_u32_e32 v86, v0, v138
|
|
v_lshlrev_b64 v[78:79], 2, v[90:91]
|
|
v_ashrrev_i32_e32 v87, 31, v86
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
v_add_co_u32 v90, vcc_lo, s0, v78
|
|
v_add_co_ci_u32_e32 v91, vcc_lo, s1, v79, vcc_lo
|
|
s_delay_alu instid0(VALU_DEP_3)
|
|
v_lshlrev_b64 v[74:75], 2, v[86:87]
|
|
global_load_b128 v[78:81], v[90:91], off
|
|
v_add_co_u32 v86, vcc_lo, s0, v74
|
|
v_add_co_ci_u32_e32 v87, vcc_lo, s1, v75, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v82, s6, v78 :: v_dual_fmac_f32 v83, s6, v79
|
|
v_dual_fmac_f32 v84, s6, v80 :: v_dual_fmac_f32 v85, s6, v81
|
|
v_dual_mul_f32 v80, s5, v71 :: v_dual_mul_f32 v81, s5, v70
|
|
v_dual_mul_f32 v78, s5, v73 :: v_dual_mul_f32 v79, s5, v72
|
|
global_store_b128 v[90:91], v[82:85], off
|
|
global_load_b128 v[74:77], v[86:87], off
|
|
v_add_nc_u32_e32 v82, v0, v134
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v83, 31, v82
|
|
v_lshlrev_b64 v[70:71], 2, v[82:83]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v82, vcc_lo, s0, v70
|
|
v_add_co_ci_u32_e32 v83, vcc_lo, s1, v71, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v78, s6, v74 :: v_dual_fmac_f32 v79, s6, v75
|
|
v_dual_fmac_f32 v80, s6, v76 :: v_dual_fmac_f32 v81, s6, v77
|
|
v_or_b32_e32 v74, 16, v119
|
|
v_dual_mul_f32 v76, s5, v67 :: v_dual_mul_f32 v77, s5, v66
|
|
v_mul_f32_e32 v75, s5, v68
|
|
global_store_b128 v[86:87], v[78:81], off
|
|
global_load_b128 v[70:73], v[82:83], off
|
|
v_mul_lo_u32 v80, v74, s4
|
|
v_mul_f32_e32 v74, s5, v69
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_add_nc_u32_e32 v78, v118, v80
|
|
v_ashrrev_i32_e32 v79, 31, v78
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_lshlrev_b64 v[66:67], 2, v[78:79]
|
|
v_add_co_u32 v78, vcc_lo, s0, v66
|
|
s_delay_alu instid0(VALU_DEP_2)
|
|
v_add_co_ci_u32_e32 v79, vcc_lo, s1, v67, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v74, s6, v70 :: v_dual_fmac_f32 v75, s6, v71
|
|
v_dual_fmac_f32 v76, s6, v72 :: v_dual_fmac_f32 v77, s6, v73
|
|
v_or_b32_e32 v70, 17, v119
|
|
v_dual_mul_f32 v72, s5, v63 :: v_dual_mul_f32 v73, s5, v62
|
|
v_mul_f32_e32 v71, s5, v64
|
|
global_store_b128 v[82:83], v[74:77], off
|
|
global_load_b128 v[66:69], v[78:79], off
|
|
v_mul_lo_u32 v76, v70, s4
|
|
v_mul_f32_e32 v70, s5, v65
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_add_nc_u32_e32 v74, v118, v76
|
|
v_ashrrev_i32_e32 v75, 31, v74
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_lshlrev_b64 v[62:63], 2, v[74:75]
|
|
v_add_co_u32 v74, vcc_lo, s0, v62
|
|
s_delay_alu instid0(VALU_DEP_2)
|
|
v_add_co_ci_u32_e32 v75, vcc_lo, s1, v63, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v70, s6, v66 :: v_dual_fmac_f32 v71, s6, v67
|
|
v_dual_fmac_f32 v72, s6, v68 :: v_dual_fmac_f32 v73, s6, v69
|
|
v_or_b32_e32 v66, 18, v119
|
|
v_dual_mul_f32 v68, s5, v59 :: v_dual_mul_f32 v69, s5, v58
|
|
v_mul_f32_e32 v67, s5, v60
|
|
global_store_b128 v[78:79], v[70:73], off
|
|
global_load_b128 v[62:65], v[74:75], off
|
|
v_mul_lo_u32 v72, v66, s4
|
|
v_mul_f32_e32 v66, s5, v61
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_add_nc_u32_e32 v70, v118, v72
|
|
v_ashrrev_i32_e32 v71, 31, v70
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_lshlrev_b64 v[58:59], 2, v[70:71]
|
|
v_add_co_u32 v70, vcc_lo, s0, v58
|
|
s_delay_alu instid0(VALU_DEP_2)
|
|
v_add_co_ci_u32_e32 v71, vcc_lo, s1, v59, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v66, s6, v62 :: v_dual_fmac_f32 v67, s6, v63
|
|
v_dual_fmac_f32 v68, s6, v64 :: v_dual_fmac_f32 v69, s6, v65
|
|
v_or_b32_e32 v62, 19, v119
|
|
v_dual_mul_f32 v64, s5, v55 :: v_dual_mul_f32 v65, s5, v54
|
|
v_mul_f32_e32 v63, s5, v56
|
|
global_store_b128 v[74:75], v[66:69], off
|
|
global_load_b128 v[58:61], v[70:71], off
|
|
v_mul_lo_u32 v68, v62, s4
|
|
v_mul_f32_e32 v62, s5, v57
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_add_nc_u32_e32 v66, v118, v68
|
|
v_ashrrev_i32_e32 v67, 31, v66
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_lshlrev_b64 v[54:55], 2, v[66:67]
|
|
v_add_co_u32 v66, vcc_lo, s0, v54
|
|
s_delay_alu instid0(VALU_DEP_2)
|
|
v_add_co_ci_u32_e32 v67, vcc_lo, s1, v55, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v62, s6, v58 :: v_dual_fmac_f32 v63, s6, v59
|
|
v_dual_fmac_f32 v64, s6, v60 :: v_dual_fmac_f32 v65, s6, v61
|
|
v_dual_mul_f32 v60, s5, v51 :: v_dual_mul_f32 v61, s5, v50
|
|
v_dual_mul_f32 v58, s5, v53 :: v_dual_mul_f32 v59, s5, v52
|
|
global_store_b128 v[70:71], v[62:65], off
|
|
global_load_b128 v[54:57], v[66:67], off
|
|
v_add_nc_u32_e32 v62, v1, v80
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v63, 31, v62
|
|
v_lshlrev_b64 v[50:51], 2, v[62:63]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v62, vcc_lo, s0, v50
|
|
v_add_co_ci_u32_e32 v63, vcc_lo, s1, v51, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v58, s6, v54 :: v_dual_fmac_f32 v59, s6, v55
|
|
v_dual_fmac_f32 v60, s6, v56 :: v_dual_fmac_f32 v61, s6, v57
|
|
v_dual_mul_f32 v56, s5, v47 :: v_dual_mul_f32 v57, s5, v46
|
|
v_dual_mul_f32 v54, s5, v49 :: v_dual_mul_f32 v55, s5, v48
|
|
global_store_b128 v[66:67], v[58:61], off
|
|
global_load_b128 v[50:53], v[62:63], off
|
|
v_add_nc_u32_e32 v58, v1, v76
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v59, 31, v58
|
|
v_lshlrev_b64 v[46:47], 2, v[58:59]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v58, vcc_lo, s0, v46
|
|
v_add_co_ci_u32_e32 v59, vcc_lo, s1, v47, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v54, s6, v50 :: v_dual_fmac_f32 v55, s6, v51
|
|
v_dual_fmac_f32 v56, s6, v52 :: v_dual_fmac_f32 v57, s6, v53
|
|
v_dual_mul_f32 v52, s5, v43 :: v_dual_mul_f32 v53, s5, v42
|
|
v_dual_mul_f32 v50, s5, v45 :: v_dual_mul_f32 v51, s5, v44
|
|
global_store_b128 v[62:63], v[54:57], off
|
|
global_load_b128 v[46:49], v[58:59], off
|
|
v_add_nc_u32_e32 v54, v1, v72
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v55, 31, v54
|
|
v_lshlrev_b64 v[42:43], 2, v[54:55]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v54, vcc_lo, s0, v42
|
|
v_add_co_ci_u32_e32 v55, vcc_lo, s1, v43, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v50, s6, v46 :: v_dual_fmac_f32 v51, s6, v47
|
|
v_dual_fmac_f32 v52, s6, v48 :: v_dual_fmac_f32 v53, s6, v49
|
|
v_dual_mul_f32 v48, s5, v39 :: v_dual_mul_f32 v49, s5, v38
|
|
v_dual_mul_f32 v46, s5, v41 :: v_dual_mul_f32 v47, s5, v40
|
|
global_store_b128 v[58:59], v[50:53], off
|
|
global_load_b128 v[42:45], v[54:55], off
|
|
v_add_nc_u32_e32 v50, v1, v68
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v51, 31, v50
|
|
v_lshlrev_b64 v[38:39], 2, v[50:51]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v50, vcc_lo, s0, v38
|
|
v_add_co_ci_u32_e32 v51, vcc_lo, s1, v39, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v46, s6, v42 :: v_dual_fmac_f32 v47, s6, v43
|
|
v_dual_fmac_f32 v48, s6, v44 :: v_dual_fmac_f32 v49, s6, v45
|
|
v_dual_mul_f32 v44, s5, v35 :: v_dual_mul_f32 v45, s5, v34
|
|
v_dual_mul_f32 v42, s5, v37 :: v_dual_mul_f32 v43, s5, v36
|
|
global_store_b128 v[54:55], v[46:49], off
|
|
global_load_b128 v[38:41], v[50:51], off
|
|
v_add_nc_u32_e32 v46, v102, v80
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v47, 31, v46
|
|
v_lshlrev_b64 v[34:35], 2, v[46:47]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v46, vcc_lo, s0, v34
|
|
v_add_co_ci_u32_e32 v47, vcc_lo, s1, v35, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v42, s6, v38 :: v_dual_fmac_f32 v43, s6, v39
|
|
v_dual_fmac_f32 v44, s6, v40 :: v_dual_fmac_f32 v45, s6, v41
|
|
v_dual_mul_f32 v40, s5, v31 :: v_dual_mul_f32 v41, s5, v30
|
|
v_dual_mul_f32 v38, s5, v33 :: v_dual_mul_f32 v39, s5, v32
|
|
global_store_b128 v[50:51], v[42:45], off
|
|
global_load_b128 v[34:37], v[46:47], off
|
|
v_add_nc_u32_e32 v42, v102, v76
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v43, 31, v42
|
|
v_lshlrev_b64 v[30:31], 2, v[42:43]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v42, vcc_lo, s0, v30
|
|
v_add_co_ci_u32_e32 v43, vcc_lo, s1, v31, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v38, s6, v34 :: v_dual_fmac_f32 v39, s6, v35
|
|
v_dual_fmac_f32 v40, s6, v36 :: v_dual_fmac_f32 v41, s6, v37
|
|
v_dual_mul_f32 v36, s5, v27 :: v_dual_mul_f32 v37, s5, v26
|
|
v_dual_mul_f32 v34, s5, v29 :: v_dual_mul_f32 v35, s5, v28
|
|
global_store_b128 v[46:47], v[38:41], off
|
|
global_load_b128 v[30:33], v[42:43], off
|
|
v_add_nc_u32_e32 v38, v102, v72
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v39, 31, v38
|
|
v_lshlrev_b64 v[26:27], 2, v[38:39]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v38, vcc_lo, s0, v26
|
|
v_add_co_ci_u32_e32 v39, vcc_lo, s1, v27, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v34, s6, v30 :: v_dual_fmac_f32 v35, s6, v31
|
|
v_dual_fmac_f32 v36, s6, v32 :: v_dual_fmac_f32 v37, s6, v33
|
|
v_dual_mul_f32 v32, s5, v23 :: v_dual_mul_f32 v33, s5, v22
|
|
v_dual_mul_f32 v30, s5, v25 :: v_dual_mul_f32 v31, s5, v24
|
|
global_store_b128 v[42:43], v[34:37], off
|
|
global_load_b128 v[26:29], v[38:39], off
|
|
v_add_nc_u32_e32 v34, v102, v68
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v35, 31, v34
|
|
v_lshlrev_b64 v[22:23], 2, v[34:35]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v34, vcc_lo, s0, v22
|
|
v_add_co_ci_u32_e32 v35, vcc_lo, s1, v23, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v30, s6, v26 :: v_dual_fmac_f32 v31, s6, v27
|
|
v_dual_fmac_f32 v32, s6, v28 :: v_dual_fmac_f32 v33, s6, v29
|
|
v_dual_mul_f32 v28, s5, v19 :: v_dual_mul_f32 v29, s5, v18
|
|
v_dual_mul_f32 v26, s5, v21 :: v_dual_mul_f32 v27, s5, v20
|
|
global_store_b128 v[38:39], v[30:33], off
|
|
global_load_b128 v[22:25], v[34:35], off
|
|
v_add_nc_u32_e32 v30, v0, v80
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v31, 31, v30
|
|
v_lshlrev_b64 v[18:19], 2, v[30:31]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v30, vcc_lo, s0, v18
|
|
v_add_co_ci_u32_e32 v31, vcc_lo, s1, v19, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v26, s6, v22 :: v_dual_fmac_f32 v27, s6, v23
|
|
v_dual_fmac_f32 v28, s6, v24 :: v_dual_fmac_f32 v29, s6, v25
|
|
v_dual_mul_f32 v24, s5, v15 :: v_dual_mul_f32 v25, s5, v14
|
|
v_dual_mul_f32 v22, s5, v17 :: v_dual_mul_f32 v23, s5, v16
|
|
global_store_b128 v[34:35], v[26:29], off
|
|
global_load_b128 v[18:21], v[30:31], off
|
|
v_add_nc_u32_e32 v26, v0, v76
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
|
|
v_ashrrev_i32_e32 v27, 31, v26
|
|
v_lshlrev_b64 v[14:15], 2, v[26:27]
|
|
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_add_co_u32 v26, vcc_lo, s0, v14
|
|
v_add_co_ci_u32_e32 v27, vcc_lo, s1, v15, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v22, s6, v18 :: v_dual_fmac_f32 v23, s6, v19
|
|
v_dual_fmac_f32 v24, s6, v20 :: v_dual_fmac_f32 v25, s6, v21
|
|
v_dual_mul_f32 v20, s5, v11 :: v_dual_mul_f32 v21, s5, v10
|
|
v_dual_mul_f32 v18, s5, v13 :: v_dual_mul_f32 v19, s5, v12
|
|
global_store_b128 v[30:31], v[22:25], off
|
|
global_load_b128 v[14:17], v[26:27], off
|
|
v_add_nc_u32_e32 v22, v0, v72
|
|
v_add_nc_u32_e32 v0, v0, v68
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_ashrrev_i32_e32 v23, 31, v22
|
|
v_ashrrev_i32_e32 v1, 31, v0
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
|
|
v_lshlrev_b64 v[10:11], 2, v[22:23]
|
|
v_lshlrev_b64 v[0:1], 2, v[0:1]
|
|
s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
|
|
v_add_co_u32 v22, vcc_lo, s0, v10
|
|
v_add_co_ci_u32_e32 v23, vcc_lo, s1, v11, vcc_lo
|
|
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
|
|
v_add_co_u32 v0, vcc_lo, s0, v0
|
|
v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v18, s6, v14 :: v_dual_fmac_f32 v19, s6, v15
|
|
v_dual_fmac_f32 v20, s6, v16 :: v_dual_fmac_f32 v21, s6, v17
|
|
v_dual_mul_f32 v14, s5, v9 :: v_dual_mul_f32 v15, s5, v8
|
|
v_dual_mul_f32 v16, s5, v7 :: v_dual_mul_f32 v17, s5, v6
|
|
global_store_b128 v[26:27], v[18:21], off
|
|
global_load_b128 v[10:13], v[22:23], off
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v14, s6, v10 :: v_dual_fmac_f32 v15, s6, v11
|
|
v_dual_fmac_f32 v16, s6, v12 :: v_dual_fmac_f32 v17, s6, v13
|
|
v_dual_mul_f32 v10, s5, v5 :: v_dual_mul_f32 v11, s5, v4
|
|
v_dual_mul_f32 v12, s5, v3 :: v_dual_mul_f32 v13, s5, v2
|
|
global_store_b128 v[22:23], v[14:17], off
|
|
global_load_b128 v[6:9], v[0:1], off
|
|
s_waitcnt vmcnt(0)
|
|
v_dual_fmac_f32 v11, s6, v7 :: v_dual_fmac_f32 v10, s6, v6
|
|
v_dual_fmac_f32 v12, s6, v8 :: v_dual_fmac_f32 v13, s6, v9
|
|
global_store_b128 v[0:1], v[10:13], off
|
|
s_nop 0
|
|
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
|
|
s_endpgm
|
|
.section .rodata,"a",@progbits
|
|
.p2align 6, 0x0
|
|
; Kernel descriptor for `kernel` (target gfx1100, wave32).
; The values below must stay in step with the .amdgpu_metadata block at the
; end of the file (group segment size, kernarg size, VGPR count, wave size).
; NOTE(review): .amdhsa_next_free_sgpr looks stale -- see the note on that line.
.amdhsa_kernel kernel
|
|
.amdhsa_group_segment_fixed_size 8320 ; LDS bytes per workgroup; same value as .group_segment_fixed_size in the metadata
|
|
.amdhsa_private_segment_fixed_size 0 ; no scratch
|
|
.amdhsa_kernarg_size 36 ; 3 x 8-byte pointers + 3 x 4-byte values, per the metadata .args list
|
|
.amdhsa_user_sgpr_count 14 ; code at the top of the file reads blockIdx.x from s14, the SGPR right after these 14
|
|
.amdhsa_user_sgpr_dispatch_ptr 0
|
|
.amdhsa_user_sgpr_queue_ptr 0
|
|
.amdhsa_user_sgpr_kernarg_segment_ptr 1 ; the kernel prologue loads its arguments through s[0:1]
|
|
.amdhsa_user_sgpr_dispatch_id 0
|
|
.amdhsa_user_sgpr_private_segment_size 0
|
|
.amdhsa_wavefront_size32 1 ; wave32; metadata .wavefront_size is 32
|
|
.amdhsa_uses_dynamic_stack 0
|
|
.amdhsa_enable_private_segment 0
|
|
.amdhsa_system_sgpr_workgroup_id_x 1 ; used as blockIdx.x (s14) at the top of the file
|
|
.amdhsa_system_sgpr_workgroup_id_y 1 ; presumably delivered in s15 -- TODO confirm against the prologue
|
|
.amdhsa_system_sgpr_workgroup_id_z 0
|
|
.amdhsa_system_sgpr_workgroup_info 0
|
|
.amdhsa_system_vgpr_workitem_id 0 ; only the X work-item id is requested (v0 is used as threadIdx.x at the top of the file)
|
|
.amdhsa_next_free_vgpr 216 ; matches metadata .vgpr_count (216)
|
|
.amdhsa_next_free_sgpr 16 ; NOTE(review): the code references SGPRs up to at least s55 and the metadata says .sgpr_count 60; this should be (highest SGPR used + 1). Presumably harmless on gfx11 -- confirm and update.
|
|
.amdhsa_float_round_mode_32 0
|
|
.amdhsa_float_round_mode_16_64 0
|
|
.amdhsa_float_denorm_mode_32 3 ; presumably "flush none" (denormals preserved) -- confirm against the LLVM AMDGPU docs
|
|
.amdhsa_float_denorm_mode_16_64 3
|
|
.amdhsa_dx10_clamp 1
|
|
.amdhsa_ieee_mode 1
|
|
.amdhsa_fp16_overflow 0
|
|
.amdhsa_workgroup_processor_mode 0 ; same value as metadata .workgroup_processor_mode
|
|
.amdhsa_memory_ordered 1
|
|
.amdhsa_forward_progress 0
|
|
.amdhsa_shared_vgpr_count 0
|
|
.amdhsa_exception_fp_ieee_invalid_op 0 ; all exception traps below are disabled
|
|
.amdhsa_exception_fp_denorm_src 0
|
|
.amdhsa_exception_fp_ieee_div_zero 0
|
|
.amdhsa_exception_fp_ieee_overflow 0
|
|
.amdhsa_exception_fp_ieee_underflow 0
|
|
.amdhsa_exception_fp_ieee_inexact 0
|
|
.amdhsa_exception_int_div_zero 0
|
|
.end_amdhsa_kernel
|
|
.text
|
|
.Lfunc_end0:
|
|
.size kernel, .Lfunc_end0-kernel
|
|
; -- End function
|
|
.section .AMDGPU.csdata,"",@progbits
|
|
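; Kernel info:
; NOTE(review): the statistics below disagree with the kernel descriptor and
; metadata in this file (e.g. NumVgprs 208 vs .amdhsa_next_free_vgpr 216,
; NumSgprs 18 vs metadata .sgpr_count 60). They appear to predate later
; hand edits; treat them as informational only and regenerate if needed.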
|
|
; codeLenInByte = 6972
|
|
; NumSgprs: 18
|
|
; NumVgprs: 208
|
|
; ScratchSize: 0
|
|
; MemoryBound: 0
|
|
; FloatMode: 240
|
|
; IeeeMode: 1
|
|
; LDSByteSize: 8320 bytes/workgroup (compile time only)
|
|
; SGPRBlocks: 2
|
|
; VGPRBlocks: 25
|
|
; NumSGPRsForWavesPerEU: 18
|
|
; NumVGPRsForWavesPerEU: 248
|
|
; Occupancy: 7
|
|
; WaveLimiterHint : 0
|
|
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
|
|
; COMPUTE_PGM_RSRC2:USER_SGPR: 14
|
|
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
|
|
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
|
|
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
|
|
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
|
|
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
|
|
; Trailing padding after the kernel code, followed by toolchain bookkeeping.
; 3214868480 == 0xBF9F0000; presumably the s_code_end instruction encoding
; used as filler past the end of the program -- confirm against the ISA guide.
.text
|
|
.p2alignl 7, 3214868480 ; pad .text to a 128-byte (2^7) boundary with 0xBF9F0000 words
|
|
.fill 96, 4, 3214868480 ; then 96 more 4-byte words (384 bytes) of the same pattern
|
|
.ident "clang version 19.0.0git (git@github.amd.com:Compute-Mirrors/llvm-project b3dbdf4f03718d63a3292f784216fddb3e73d521)"
|
|
.section ".note.GNU-stack","",@progbits
|
|
.addrsig
|
|
; Code-object metadata (YAML) for `kernel`.
; Arguments: three 8-byte global pointers at kernarg offsets 0, 8 and 16,
; then three 4-byte by-value arguments at offsets 24, 28 and 32 (36 bytes).
; The prologue at the top of the file loads 16 bytes from kernarg offset 0x8
; as "Matrix A and B", i.e. the second and third pointers.
; NOTE(review): .sgpr_count (60) disagrees with .amdhsa_next_free_sgpr (16) in
; the kernel descriptor; .vgpr_count (216) matches .amdhsa_next_free_vgpr.
; NOTE(review): the YAML nesting indentation appears to have been flattened in
; this copy -- verify the structure before feeding it to the assembler.
.amdgpu_metadata
|
|
---
|
|
amdhsa.kernels:
|
|
- .args:
|
|
- .address_space: global
|
|
.offset: 0
|
|
.size: 8
|
|
.value_kind: global_buffer
|
|
- .address_space: global
|
|
.offset: 8
|
|
.size: 8
|
|
.value_kind: global_buffer
|
|
- .address_space: global
|
|
.offset: 16
|
|
.size: 8
|
|
.value_kind: global_buffer
|
|
- .offset: 24
|
|
.size: 4
|
|
.value_kind: by_value
|
|
- .offset: 28
|
|
.size: 4
|
|
.value_kind: by_value
|
|
- .offset: 32
|
|
.size: 4
|
|
.value_kind: by_value
|
|
.group_segment_fixed_size: 8320
|
|
.kernarg_segment_align: 8
|
|
.kernarg_segment_size: 36
|
|
.language: OpenCL C
|
|
.language_version:
|
|
- 2
|
|
- 0
|
|
.max_flat_workgroup_size: 128
|
|
.name: kernel
|
|
.private_segment_fixed_size: 0
|
|
.sgpr_count: 60
|
|
.sgpr_spill_count: 0
|
|
.symbol: kernel.kd
|
|
.uniform_work_group_size: 1
|
|
.uses_dynamic_stack: false
|
|
.vgpr_count: 216
|
|
.vgpr_spill_count: 0
|
|
.wavefront_size: 32
|
|
.workgroup_processor_mode: 0
|
|
amdhsa.target: amdgcn-amd-amdhsa--gfx1100
|
|
amdhsa.version:
|
|
- 1
|
|
- 2
|
|
...
|
|
|
|
.end_amdgpu_metadata
|
|
|